diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 77caf3ef82d3..b555df825447 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -486,6 +486,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/spec_store_bypass /sys/devices/system/cpu/vulnerabilities/l1tf /sys/devices/system/cpu/vulnerabilities/mds + /sys/devices/system/cpu/vulnerabilities/srbds /sys/devices/system/cpu/vulnerabilities/tsx_async_abort /sys/devices/system/cpu/vulnerabilities/itlb_multihit Date: January 2018 diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index c2a4b652bd1a..ce3e05e41724 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1170,6 +1170,13 @@ PAGE_SIZE multiple when read back. Under certain circumstances, the usage may go over the limit temporarily. + In default configuration regular 0-order allocations always + succeed unless OOM killer chooses current task as a victim. + + Some kinds of allocations don't invoke the OOM killer. + Caller could retry them differently, return into userspace + as -ENOMEM or silently ignore in cases like disk readahead. + This is the ultimate protection mechanism. As long as the high limit is used and monitored properly, this limit's utility is limited to providing the final safety net. @@ -1226,17 +1233,9 @@ PAGE_SIZE multiple when read back. The number of time the cgroup's memory usage was reached the limit and allocation was about to fail. - Depending on context result could be invocation of OOM - killer and retrying allocation or failing allocation. - - Failed allocation in its turn could be returned into - userspace as -ENOMEM or silently ignored in cases like - disk readahead. For now OOM in memory cgroup kills - tasks iff shortage has happened inside page fault. - This event is not raised if the OOM killer is not considered as an option, e.g. for failed high-order - allocations. + allocations or if caller asked to not retry attempts. oom_kill The number of processes belonging to this cgroup diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 0dc2eb8e44e5..1012bd9305e9 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -13,6 +13,11 @@ kernel code to obtain additional kernel information. Currently, if ``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically enabled per-callsite. +If you do not want to enable dynamic debug globally (i.e. in some embedded +system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic +debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any +modules which you'd like to dynamically debug later. + If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just shortcut for ``print_hex_dump(KERN_DEBUG)``. diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 0795e3c2643f..ca4dbdd9016d 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -14,3 +14,4 @@ are configurable at compile, boot or run time. mds tsx_async_abort multihit.rst + special-register-buffer-data-sampling.rst diff --git a/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst b/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst new file mode 100644 index 000000000000..47b1b3afac99 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst @@ -0,0 +1,149 @@ +.. SPDX-License-Identifier: GPL-2.0 + +SRBDS - Special Register Buffer Data Sampling +============================================= + +SRBDS is a hardware vulnerability that allows MDS :doc:`mds` techniques to +infer values returned from special register accesses. Special register +accesses are accesses to off core registers. According to Intel's evaluation, +the special register reads that have a security expectation of privacy are +RDRAND, RDSEED and SGX EGETKEY. + +When RDRAND, RDSEED and EGETKEY instructions are used, the data is moved +to the core through the special register mechanism that is susceptible +to MDS attacks. + +Affected processors +-------------------- +Core models (desktop, mobile, Xeon-E3) that implement RDRAND and/or RDSEED may +be affected. + +A processor is affected by SRBDS if its Family_Model and stepping is +in the following list, with the exception of the listed processors +exporting MDS_NO while Intel TSX is available yet not enabled. The +latter class of processors are only affected when Intel TSX is enabled +by software using TSX_CTRL_MSR otherwise they are not affected. + + ============= ============ ======== + common name Family_Model Stepping + ============= ============ ======== + IvyBridge 06_3AH All + + Haswell 06_3CH All + Haswell_L 06_45H All + Haswell_G 06_46H All + + Broadwell_G 06_47H All + Broadwell 06_3DH All + + Skylake_L 06_4EH All + Skylake 06_5EH All + + Kabylake_L 06_8EH <= 0xC + Kabylake 06_9EH <= 0xD + ============= ============ ======== + +Related CVEs +------------ + +The following CVE entry is related to this SRBDS issue: + + ============== ===== ===================================== + CVE-2020-0543 SRBDS Special Register Buffer Data Sampling + ============== ===== ===================================== + +Attack scenarios +---------------- +An unprivileged user can extract values returned from RDRAND and RDSEED +executed on another core or sibling thread using MDS techniques. + + +Mitigation mechanism +------------------- +Intel will release microcode updates that modify the RDRAND, RDSEED, and +EGETKEY instructions to overwrite secret special register data in the shared +staging buffer before the secret data can be accessed by another logical +processor. + +During execution of the RDRAND, RDSEED, or EGETKEY instructions, off-core +accesses from other logical processors will be delayed until the special +register read is complete and the secret data in the shared staging buffer is +overwritten. + +This has three effects on performance: + +#. RDRAND, RDSEED, or EGETKEY instructions have higher latency. + +#. Executing RDRAND at the same time on multiple logical processors will be + serialized, resulting in an overall reduction in the maximum RDRAND + bandwidth. + +#. Executing RDRAND, RDSEED or EGETKEY will delay memory accesses from other + logical processors that miss their core caches, with an impact similar to + legacy locked cache-line-split accesses. + +The microcode updates provide an opt-out mechanism (RNGDS_MITG_DIS) to disable +the mitigation for RDRAND and RDSEED instructions executed outside of Intel +Software Guard Extensions (Intel SGX) enclaves. On logical processors that +disable the mitigation using this opt-out mechanism, RDRAND and RDSEED do not +take longer to execute and do not impact performance of sibling logical +processors memory accesses. The opt-out mechanism does not affect Intel SGX +enclaves (including execution of RDRAND or RDSEED inside an enclave, as well +as EGETKEY execution). + +IA32_MCU_OPT_CTRL MSR Definition +-------------------------------- +Along with the mitigation for this issue, Intel added a new thread-scope +IA32_MCU_OPT_CTRL MSR, (address 0x123). The presence of this MSR and +RNGDS_MITG_DIS (bit 0) is enumerated by CPUID.(EAX=07H,ECX=0).EDX[SRBDS_CTRL = +9]==1. This MSR is introduced through the microcode update. + +Setting IA32_MCU_OPT_CTRL[0] (RNGDS_MITG_DIS) to 1 for a logical processor +disables the mitigation for RDRAND and RDSEED executed outside of an Intel SGX +enclave on that logical processor. Opting out of the mitigation for a +particular logical processor does not affect the RDRAND and RDSEED mitigations +for other logical processors. + +Note that inside of an Intel SGX enclave, the mitigation is applied regardless +of the value of RNGDS_MITG_DS. + +Mitigation control on the kernel command line +--------------------------------------------- +The kernel command line allows control over the SRBDS mitigation at boot time +with the option "srbds=". The option for this is: + + ============= ============================================================= + off This option disables SRBDS mitigation for RDRAND and RDSEED on + affected platforms. + ============= ============================================================= + +SRBDS System Information +----------------------- +The Linux kernel provides vulnerability status information through sysfs. For +SRBDS this can be accessed by the following sysfs file: +/sys/devices/system/cpu/vulnerabilities/srbds + +The possible values contained in this file are: + + ============================== ============================================= + Not affected Processor not vulnerable + Vulnerable Processor vulnerable and mitigation disabled + Vulnerable: No microcode Processor vulnerable and microcode is missing + mitigation + Mitigation: Microcode Processor is vulnerable and mitigation is in + effect. + Mitigation: TSX disabled Processor is only vulnerable when TSX is + enabled while this system was booted with TSX + disabled. + Unknown: Dependent on + hypervisor status Running on virtual guest processor that is + affected but with no way to know if host + processor is mitigated or vulnerable. + ============================== ============================================= + +SRBDS Default mitigation +------------------------ +This new microcode serializes processor access during execution of RDRAND, +RDSEED ensures that the shared buffer is overwritten before it is released for +reuse. Use the "srbds=off" kernel command line to disable the mitigation for +RDRAND and RDSEED. diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index ac7e131d2935..2da65fef2a1c 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -521,6 +521,14 @@ will cause a kdump to occur at the panic() call. In cases where a user wants to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 to achieve the same behaviour. +Trigger Kdump on add_taint() +============================ + +The kernel parameter panic_on_taint facilitates a conditional call to panic() +from within add_taint() whenever the value set in this bitmask matches with the +bit flag being set by add_taint(). +This will cause a kdump to occur at the add_taint()->panic() call. + Contact ======= diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f3eeecbb3f63..fb95fad81c79 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1445,7 +1445,7 @@ hardlockup_all_cpu_backtrace= [KNL] Should the hard-lockup detector generate backtraces on all cpus. - Format: + Format: 0 | 1 hashdist= [KNL,NUMA] Large hashes allocated during boot are distributed across NUMA nodes. Defaults on @@ -1513,9 +1513,9 @@ hung_task_panic= [KNL] Should the hung task detector generate panics. - Format: + Format: 0 | 1 - A nonzero value instructs the kernel to panic when a + A value of 1 instructs the kernel to panic when a hung task is detected. The default value is controlled by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time option. The value selected by this boot parameter can @@ -3447,6 +3447,19 @@ bit 4: print ftrace buffer bit 5: print all printk messages in buffer + panic_on_taint= Bitmask for conditionally calling panic() in add_taint() + Format: [,nousertaint] + Hexadecimal bitmask representing the set of TAINT flags + that will cause the kernel to panic when add_taint() is + called with any of the flags in this set. + The optional switch "nousertaint" can be utilized to + prevent userspace forced crashes by writing to sysctl + /proc/sys/kernel/tainted any flagset matching with the + bitmask set on panic_on_taint. + See Documentation/admin-guide/tainted-kernels.rst for + extra details on the taint flags that users can pick + to compose the bitmask to assign to panic_on_taint. + panic_on_warn panic() instead of WARN(). Useful to cause kdump on a WARN(). @@ -3715,6 +3728,8 @@ may put more devices in an IOMMU group. force_floating [S390] Force usage of floating interrupts. nomio [S390] Do not use MIO instructions. + norid [S390] ignore the RID field and force use of + one PCI domain per PCI function pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. @@ -4652,9 +4667,9 @@ softlockup_panic= [KNL] Should the soft-lockup detector generate panics. - Format: + Format: 0 | 1 - A nonzero value instructs the soft-lockup detector + A value of 1 instructs the soft-lockup detector to panic the machine when a soft-lockup occurs. It is also controlled by the kernel.softlockup_panic sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the @@ -4663,7 +4678,7 @@ softlockup_all_cpu_backtrace= [KNL] Should the soft-lockup detector generate backtraces on all cpus. - Format: + Format: 0 | 1 sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/admin-guide/laptops/sonypi.rst @@ -4822,6 +4837,26 @@ the kernel will oops in either "warn" or "fatal" mode. + srbds= [X86,INTEL] + Control the Special Register Buffer Data Sampling + (SRBDS) mitigation. + + Certain CPUs are vulnerable to an MDS-like + exploit which can leak bits from the random + number generator. + + By default, this issue is mitigated by + microcode. However, the microcode fix can cause + the RDRAND and RDSEED instructions to become + much slower. Among other effects, this will + result in reduced throughput from /dev/urandom. + + The microcode mitigation can be disabled with + the following option: + + off: Disable mitigation and remove + performance impact to RDRAND and RDSEED + srcutree.counter_wrap_check [KNL] Specifies how frequently to check for grace-period sequence counter wrap for the @@ -4956,6 +4991,15 @@ switches= [HW,M68k] + sysctl.*= [KNL] + Set a sysctl parameter, right before loading the init + process, as if the value was written to the respective + /proc/sys/... file. Both '.' and '/' are recognized as + separators. Unrecognized parameters and invalid values + are reported in the kernel log. Sysctls registered + later by a loaded module cannot be set this way. + Example: sysctl.vm.swappiness=40 + sysfs.deprecated=0|1 [KNL] Enable/disable old style sysfs layout for old udev on older distributions. When this option is enabled diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index 8463f5538fda..067a90a1499c 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -364,19 +364,19 @@ follows: 2) for querying the policy, we do not need to take an extra reference on the target task's task policy nor vma policies because we always acquire the - task's mm's mmap_sem for read during the query. The set_mempolicy() and - mbind() APIs [see below] always acquire the mmap_sem for write when + task's mm's mmap_lock for read during the query. The set_mempolicy() and + mbind() APIs [see below] always acquire the mmap_lock for write when installing or replacing task or vma policies. Thus, there is no possibility of a task or thread freeing a policy while another task or thread is querying it. 3) Page allocation usage of task or vma policy occurs in the fault path where - we hold them mmap_sem for read. Again, because replacing the task or vma - policy requires that the mmap_sem be held for write, the policy can't be + we hold them mmap_lock for read. Again, because replacing the task or vma + policy requires that the mmap_lock be held for write, the policy can't be freed out from under us while we're using it for page allocation. 4) Shared policies require special consideration. One task can replace a - shared memory policy while another task, with a distinct mmap_sem, is + shared memory policy while another task, with a distinct mmap_lock, is querying or allocating a page based on the policy. To resolve this potential race, the shared policy infrastructure adds an extra reference to the shared policy during lookup while holding a spin lock on the shared diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 0bf49d7313ad..1dc2d5f823b4 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -33,7 +33,7 @@ memory ranges) provides two primary functionalities: The real advantage of userfaults if compared to regular virtual memory management of mremap/mprotect is that the userfaults in all their operations never involve heavyweight structures like vmas (in fact the -``userfaultfd`` runtime load never takes the mmap_sem for writing). +``userfaultfd`` runtime load never takes the mmap_lock for writing). Vmas are not suitable for page- (or hugepage) granular fault tracking when dealing with virtual address spaces that could span diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 1ebf68d01141..83acf5025488 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -335,6 +335,20 @@ Path for the hotplug policy agent. Default value is "``/sbin/hotplug``". +hung_task_all_cpu_backtrace: +================ + +If this option is set, the kernel will send an NMI to all CPUs to dump +their backtraces when a hung task is detected. This file shows up if +CONFIG_DETECT_HUNG_TASK and CONFIG_SMP are enabled. + +0: Won't show all CPUs backtraces when a hung task is detected. +This is the default behavior. + +1: Will non-maskably interrupt all CPUs and dump their backtraces when +a hung task is detected. + + hung_task_panic =============== @@ -632,6 +646,22 @@ rate for each task. scanned for a given scan. +oops_all_cpu_backtrace: +================ + +If this option is set, the kernel will send an NMI to all CPUs to dump +their backtraces when an oops event occurs. It should be used as a last +resort in case a panic cannot be triggered (to protect VMs running, for +example) or kdump can't be collected. This file shows up if CONFIG_SMP +is enabled. + +0: Won't show all CPUs backtraces when an oops is detected. +This is the default behavior. + +1: Will non-maskably interrupt all CPUs and dump their backtraces when +an oops event is detected. + + osrelease, ostype & version =========================== @@ -1239,6 +1269,13 @@ ORed together. The letters are seen in "Tainted" line of Oops reports. See :doc:`/admin-guide/tainted-kernels` for more information. +Note: + writes to this sysctl interface will fail with ``EINVAL`` if the kernel is + booted with the command line option ``panic_on_taint=,nousertaint`` + and any of the ORed together values being written to ``tainted`` match with + the bitmask declared on panic_on_taint. + See :doc:`/admin-guide/kernel-parameters` for more details on that particular + kernel command line option and its optional ``nousertaint`` switch. threads-max =========== diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 2e939ff10b86..6068266dd303 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -148,23 +148,46 @@ NOTE: Some pages, such as DAX pages, cannot be pinned with longterm pins. That's because DAX pages do not have a separate page cache, and so "pinning" implies locking down file system blocks, which is not (yet) supported in that way. -CASE 3: Hardware with page faulting support -------------------------------------------- -Here, a well-written driver doesn't normally need to pin pages at all. However, -if the driver does choose to do so, it can register MMU notifiers for the range, -and will be called back upon invalidation. Either way (avoiding page pinning, or -using MMU notifiers to unpin upon request), there is proper synchronization with -both filesystem and mm (page_mkclean(), munmap(), etc). +CASE 3: MMU notifier registration, with or without page faulting hardware +------------------------------------------------------------------------- +Device drivers can pin pages via get_user_pages*(), and register for mmu +notifier callbacks for the memory range. Then, upon receiving a notifier +"invalidate range" callback , stop the device from using the range, and unpin +the pages. There may be other possible schemes, such as for example explicitly +synchronizing against pending IO, that accomplish approximately the same thing. -Therefore, neither flag needs to be set. +Or, if the hardware supports replayable page faults, then the device driver can +avoid pinning entirely (this is ideal), as follows: register for mmu notifier +callbacks as above, but instead of stopping the device and unpinning in the +callback, simply remove the range from the device's page tables. -In this case, ideally, neither get_user_pages() nor pin_user_pages() should be -called. Instead, the software should be written so that it does not pin pages. -This allows mm and filesystems to operate more efficiently and reliably. +Either way, as long as the driver unpins the pages upon mmu notifier callback, +then there is proper synchronization with both filesystem and mm +(page_mkclean(), munmap(), etc). Therefore, neither flag needs to be set. CASE 4: Pinning for struct page manipulation only ------------------------------------------------- -Here, normal GUP calls are sufficient, so neither flag needs to be set. +If only struct page data (as opposed to the actual memory contents that a page +is tracking) is affected, then normal GUP calls are sufficient, and neither flag +needs to be set. + +CASE 5: Pinning in order to write to the data within the page +------------------------------------------------------------- +Even though neither DMA nor Direct IO is involved, just a simple case of "pin, +write to a page's data, unpin" can cause a problem. Case 5 may be considered a +superset of Case 1, plus Case 2, plus anything that invokes that pattern. In +other words, if the code is neither Case 1 nor Case 2, it may still require +FOLL_PIN, for patterns like this: + +Correct (uses FOLL_PIN calls): + pin_user_pages() + write to the data within the pages + unpin_user_pages() + +INCORRECT (uses FOLL_GET calls): + get_user_pages() + write to the data within the pages + put_page() page_maybe_dma_pinned(): the whole point of pinning =================================================== diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index 5d1f56fcd2e7..469d115a95f1 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -151,6 +151,29 @@ note some tests will require root privileges:: $ cd kselftest $ ./run_kselftest.sh +Packaging selftests +=================== + +In some cases packaging is desired, such as when tests need to run on a +different system. To package selftests, run:: + + $ make -C tools/testing/selftests gen_tar + +This generates a tarball in the `INSTALL_PATH/kselftest-packages` directory. By +default, `.gz` format is used. The tar format can be overridden by specifying +a `FORMAT` make variable. Any value recognized by `tar's auto-compress`_ option +is supported, such as:: + + $ make -C tools/testing/selftests gen_tar FORMAT=.xz + +`make gen_tar` invokes `make install` so you can use it to package a subset of +tests by using variables specified in `Running a subset of selftests`_ +section:: + + $ make -C tools/testing/selftests gen_tar TARGETS="bpf" FORMAT=.xz + +.. _tar's auto-compress: https://www.gnu.org/software/tar/manual/html_node/gzip.html#auto_002dcompress + Contributing new tests ====================== diff --git a/Documentation/dev-tools/kunit/start.rst b/Documentation/dev-tools/kunit/start.rst index e1c5ce80ce12..bb112cf70624 100644 --- a/Documentation/dev-tools/kunit/start.rst +++ b/Documentation/dev-tools/kunit/start.rst @@ -32,15 +32,17 @@ test targets as well. The ``.kunitconfig`` should also contain any other config options required by the tests. A good starting point for a ``.kunitconfig`` is the KUnit defconfig: + .. code-block:: bash cd $PATH_TO_LINUX_REPO cp arch/um/configs/kunit_defconfig .kunitconfig You can then add any other Kconfig options you wish, e.g.: + .. code-block:: none - CONFIG_LIST_KUNIT_TEST=y + CONFIG_LIST_KUNIT_TEST=y :doc:`kunit_tool ` will ensure that all config options set in ``.kunitconfig`` are set in the kernel ``.config`` before running the tests. @@ -54,8 +56,8 @@ using. other tools (such as make menuconfig) to adjust other config options. -Running the tests ------------------ +Running the tests (KUnit Wrapper) +--------------------------------- To make sure that everything is set up correctly, simply invoke the Python wrapper from your kernel repo: @@ -105,8 +107,9 @@ have config options ending in ``_KUNIT_TEST``. KUnit and KUnit tests can be compiled as modules: in this case the tests in a module will be run when the module is loaded. -Running the tests ------------------ + +Running the tests (w/o KUnit Wrapper) +------------------------------------- Build and run your kernel as usual. Test output will be written to the kernel log in `TAP `_ format. diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst index 473a2361ec37..3c3fe8b5fecc 100644 --- a/Documentation/dev-tools/kunit/usage.rst +++ b/Documentation/dev-tools/kunit/usage.rst @@ -595,7 +595,7 @@ able to run one test case per invocation. KUnit debugfs representation ============================ When kunit test suites are initialized, they create an associated directory -in /sys/kernel/debug/kunit/. The directory contains one file +in ``/sys/kernel/debug/kunit/``. The directory contains one file - results: "cat results" displays results of each test case and the results of the entire suite for the last test run. @@ -604,4 +604,4 @@ The debugfs representation is primarily of use when kunit test suites are run in a native environment, either as modules or builtin. Having a way to display results like this is valuable as otherwise results can be intermixed with other events in dmesg output. The maximum size of each -results file is KUNIT_LOG_SIZE bytes (defined in include/kunit/test.h). +results file is KUNIT_LOG_SIZE bytes (defined in ``include/kunit/test.h``). diff --git a/Documentation/devicetree/bindings/iommu/allwinner,sun50i-h6-iommu.yaml b/Documentation/devicetree/bindings/iommu/allwinner,sun50i-h6-iommu.yaml new file mode 100644 index 000000000000..5e125cf2a88b --- /dev/null +++ b/Documentation/devicetree/bindings/iommu/allwinner,sun50i-h6-iommu.yaml @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/iommu/allwinner,sun50i-h6-iommu.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Allwinner H6 IOMMU Device Tree Bindings + +maintainers: + - Chen-Yu Tsai + - Maxime Ripard + +properties: + "#iommu-cells": + const: 1 + description: + The content of the cell is the master ID. + + compatible: + const: allwinner,sun50i-h6-iommu + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 1 + + resets: + maxItems: 1 + +required: + - "#iommu-cells" + - compatible + - reg + - interrupts + - clocks + - resets + +additionalProperties: false + +examples: + - | + #include + #include + + #include + #include + + iommu: iommu@30f0000 { + compatible = "allwinner,sun50i-h6-iommu"; + reg = <0x030f0000 0x10000>; + interrupts = ; + clocks = <&ccu CLK_BUS_IOMMU>; + resets = <&ccu RST_BUS_IOMMU>; + #iommu-cells = <1>; + }; + +... diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml index 218f7ecb4703..d7ceb4c34423 100644 --- a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml +++ b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml @@ -42,7 +42,9 @@ properties: - const: arm,mmu-500 - const: arm,smmu-v2 - items: - - const: arm,mmu-401 + - enum: + - arm,mmu-400 + - arm,mmu-401 - const: arm,smmu-v1 - enum: - arm,smmu-v1 diff --git a/Documentation/devicetree/bindings/remoteproc/ingenic,vpu.yaml b/Documentation/devicetree/bindings/remoteproc/ingenic,vpu.yaml new file mode 100644 index 000000000000..c019f9fbe916 --- /dev/null +++ b/Documentation/devicetree/bindings/remoteproc/ingenic,vpu.yaml @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/remoteproc/ingenic,vpu.yaml#" +$schema: "http://devicetree.org/meta-schemas/core.yaml#" + +title: Ingenic Video Processing Unit bindings + +description: + Inside the Video Processing Unit (VPU) of the recent JZ47xx SoCs from + Ingenic is a second Xburst MIPS CPU very similar to the main core. + This document describes the devicetree bindings for this auxiliary + processor. + +maintainers: + - Paul Cercueil + +properties: + compatible: + const: ingenic,jz4770-vpu-rproc + + reg: + items: + - description: aux registers + - description: tcsm0 registers + - description: tcsm1 registers + - description: sram registers + + reg-names: + items: + - const: aux + - const: tcsm0 + - const: tcsm1 + - const: sram + + clocks: + items: + - description: aux clock + - description: vpu clock + + clock-names: + items: + - const: aux + - const: vpu + + interrupts: + description: VPU hardware interrupt + +required: + - compatible + - reg + - reg-names + - clocks + - clock-names + - interrupts + +additionalProperties: false + +examples: + - | + #include + + vpu: video-decoder@132a0000 { + compatible = "ingenic,jz4770-vpu-rproc"; + + reg = <0x132a0000 0x20>, /* AUX */ + <0x132b0000 0x4000>, /* TCSM0 */ + <0x132c0000 0xc000>, /* TCSM1 */ + <0x132f0000 0x7000>; /* SRAM */ + reg-names = "aux", "tcsm0", "tcsm1", "sram"; + + clocks = <&cgu JZ4770_CLK_AUX>, <&cgu JZ4770_CLK_VPU>; + clock-names = "aux", "vpu"; + + interrupt-parent = <&cpuintc>; + interrupts = <3>; + }; diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt index 9938918b2fea..54737024da20 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt @@ -15,12 +15,16 @@ on the Qualcomm ADSP Hexagon core. "qcom,qcs404-adsp-pas" "qcom,qcs404-cdsp-pas" "qcom,qcs404-wcss-pas" + "qcom,sc7180-mpss-pas" "qcom,sdm845-adsp-pas" "qcom,sdm845-cdsp-pas" "qcom,sm8150-adsp-pas" "qcom,sm8150-cdsp-pas" "qcom,sm8150-mpss-pas" "qcom,sm8150-slpi-pas" + "qcom,sm8250-adsp-pas" + "qcom,sm8250-cdsp-pas" + "qcom,sm8250-slpi-pas" - interrupts-extended: Usage: required @@ -44,8 +48,12 @@ on the Qualcomm ADSP Hexagon core. qcom,sm8150-adsp-pas: qcom,sm8150-cdsp-pas: qcom,sm8150-slpi-pas: + qcom,sm8250-adsp-pas: + qcom,sm8250-cdsp-pas: + qcom,sm8250-slpi-pas: must be "wdog", "fatal", "ready", "handover", "stop-ack" qcom,qcs404-wcss-pas: + qcom,sc7180-mpss-pas: qcom,sm8150-mpss-pas: must be "wdog", "fatal", "ready", "handover", "stop-ack", "shutdown-ack" @@ -105,10 +113,14 @@ on the Qualcomm ADSP Hexagon core. qcom,sdm845-cdsp-pas: qcom,sm8150-adsp-pas: qcom,sm8150-cdsp-pas: + qcom,sm8250-cdsp-pas: must be "cx", "load_state" + qcom,sc7180-mpss-pas: qcom,sm8150-mpss-pas: must be "cx", "load_state", "mss" + qcom,sm8250-adsp-pas: qcom,sm8150-slpi-pas: + qcom,sm8250-slpi-pas: must be "lcx", "lmx", "load_state" - memory-region: diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt index 88dfa3fc15f7..1f9a62e13ebe 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt @@ -79,7 +79,7 @@ on the Qualcomm Hexagon core. "snoc_axi", "mnoc_axi", "qdss" qcom,sc7180-mss-pil: must be "iface", "bus", "xo", "snoc_axi", "mnoc_axi", - "mss_crypto", "mss_nav", "nav" + "nav" qcom,sdm845-mss-pil: must be "iface", "bus", "mem", "xo", "gpll0_mss", "snoc_axi", "mnoc_axi", "prng" @@ -102,6 +102,14 @@ on the Qualcomm Hexagon core. must be "mss_restart", "pdc_reset" for the modem sub-system on SC7180, SDM845 SoCs +For devices where the mba and mpss sub-nodes are not specified, mba/mpss region +should be referenced as follows: +- memory-region: + Usage: required + Value type: + Definition: reference to the reserved-memory for the mba region followed + by the mpss region + For the compatible strings below the following supplies are required: "qcom,q6v5-pil" "qcom,msm8916-mss-pil", @@ -173,16 +181,15 @@ For the compatible string below the following supplies are required: For the compatible strings below the following phandle references are required: "qcom,sc7180-mss-pil" -- qcom,halt-nav-regs: +- qcom,spare-regs: Usage: required Value type: - Definition: reference to a list of 2 phandles with one offset each for - the modem sub-system running on SC7180 SoC. The first - phandle reference is to the mss clock node followed by the - offset within register space for nav halt register. The - second phandle reference is to a syscon representing TCSR - followed by the offset within syscon for conn_box_spare0 - register. + Definition: a phandle reference to a syscon representing TCSR followed + by the offset within syscon for conn_box_spare0 register + used by the modem sub-system running on SC7180 SoC. + +The Hexagon node must contain iommus property as described in ../iommu/iommu.txt +on platforms which do not have TrustZone. = SUBNODES: The Hexagon node must contain two subnodes, named "mba" and "mpss" representing diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.rst similarity index 63% rename from Documentation/filesystems/gfs2-glocks.txt rename to Documentation/filesystems/gfs2-glocks.rst index 7059623635b2..d14f230f0b12 100644 --- a/Documentation/filesystems/gfs2-glocks.txt +++ b/Documentation/filesystems/gfs2-glocks.rst @@ -1,5 +1,8 @@ - Glock internal locking rules - ------------------------------ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +Glock internal locking rules +============================ This documents the basic principles of the glock state machine internals. Each glock (struct gfs2_glock in fs/gfs2/incore.h) @@ -24,24 +27,28 @@ There are three lock states that users of the glock layer can request, namely shared (SH), deferred (DF) and exclusive (EX). Those translate to the following DLM lock modes: -Glock mode | DLM lock mode ------------------------------- - UN | IV/NL Unlocked (no DLM lock associated with glock) or NL - SH | PR (Protected read) - DF | CW (Concurrent write) - EX | EX (Exclusive) +========== ====== ===================================================== +Glock mode DLM lock mode +========== ====== ===================================================== + UN IV/NL Unlocked (no DLM lock associated with glock) or NL + SH PR (Protected read) + DF CW (Concurrent write) + EX EX (Exclusive) +========== ====== ===================================================== Thus DF is basically a shared mode which is incompatible with the "normal" shared lock mode, SH. In GFS2 the DF mode is used exclusively for direct I/O operations. The glocks are basically a lock plus some routines which deal with cache management. The following rules apply for the cache: -Glock mode | Cache data | Cache Metadata | Dirty Data | Dirty Metadata --------------------------------------------------------------------------- - UN | No | No | No | No - SH | Yes | Yes | No | No - DF | No | Yes | No | No - EX | Yes | Yes | Yes | Yes +========== ========== ============== ========== ============== +Glock mode Cache data Cache Metadata Dirty Data Dirty Metadata +========== ========== ============== ========== ============== + UN No No No No + SH Yes Yes No No + DF No Yes No No + EX Yes Yes Yes Yes +========== ========== ============== ========== ============== These rules are implemented using the various glock operations which are defined for each type of glock. Not all types of glocks use @@ -49,21 +56,23 @@ all the modes. Only inode glocks use the DF mode for example. Table of glock operations and per type constants: -Field | Purpose ----------------------------------------------------------------------------- -go_xmote_th | Called before remote state change (e.g. to sync dirty data) -go_xmote_bh | Called after remote state change (e.g. to refill cache) -go_inval | Called if remote state change requires invalidating the cache -go_demote_ok | Returns boolean value of whether its ok to demote a glock - | (e.g. checks timeout, and that there is no cached data) -go_lock | Called for the first local holder of a lock -go_unlock | Called on the final local unlock of a lock -go_dump | Called to print content of object for debugfs file, or on - | error to dump glock to the log. -go_type | The type of the glock, LM_TYPE_..... -go_callback | Called if the DLM sends a callback to drop this lock -go_flags | GLOF_ASPACE is set, if the glock has an address space - | associated with it +============= ============================================================= +Field Purpose +============= ============================================================= +go_xmote_th Called before remote state change (e.g. to sync dirty data) +go_xmote_bh Called after remote state change (e.g. to refill cache) +go_inval Called if remote state change requires invalidating the cache +go_demote_ok Returns boolean value of whether its ok to demote a glock + (e.g. checks timeout, and that there is no cached data) +go_lock Called for the first local holder of a lock +go_unlock Called on the final local unlock of a lock +go_dump Called to print content of object for debugfs file, or on + error to dump glock to the log. +go_type The type of the glock, ``LM_TYPE_*`` +go_callback Called if the DLM sends a callback to drop this lock +go_flags GLOF_ASPACE is set, if the glock has an address space + associated with it +============= ============================================================= The minimum hold time for each lock is the time after a remote lock grant for which we ignore remote demote requests. This is in order to @@ -82,21 +91,25 @@ rather than via the glock. Locking rules for glock operations: -Operation | GLF_LOCK bit lock held | gl_lockref.lock spinlock held -------------------------------------------------------------------------- -go_xmote_th | Yes | No -go_xmote_bh | Yes | No -go_inval | Yes | No -go_demote_ok | Sometimes | Yes -go_lock | Yes | No -go_unlock | Yes | No -go_dump | Sometimes | Yes -go_callback | Sometimes (N/A) | Yes +============= ====================== ============================= +Operation GLF_LOCK bit lock held gl_lockref.lock spinlock held +============= ====================== ============================= +go_xmote_th Yes No +go_xmote_bh Yes No +go_inval Yes No +go_demote_ok Sometimes Yes +go_lock Yes No +go_unlock Yes No +go_dump Sometimes Yes +go_callback Sometimes (N/A) Yes +============= ====================== ============================= -N.B. Operations must not drop either the bit lock or the spinlock -if its held on entry. go_dump and do_demote_ok must never block. -Note that go_dump will only be called if the glock's state -indicates that it is caching uptodate data. +.. Note:: + + Operations must not drop either the bit lock or the spinlock + if its held on entry. go_dump and do_demote_ok must never block. + Note that go_dump will only be called if the glock's state + indicates that it is caching uptodate data. Glock locking order within GFS2: @@ -104,7 +117,7 @@ Glock locking order within GFS2: 2. Rename glock (for rename only) 3. Inode glock(s) (Parents before children, inodes at "same level" with same parent in - lock number order) + lock number order) 4. Rgrp glock(s) (for (de)allocation operations) 5. Transaction glock (via gfs2_trans_begin) for non-read operations 6. i_rw_mutex (if required) @@ -117,8 +130,8 @@ determine the lifetime of the inode in question. Locking of inodes is on a per-inode basis. Locking of rgrps is on a per rgrp basis. In general we prefer to lock local locks prior to cluster locks. - Glock Statistics - ------------------ +Glock Statistics +---------------- The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The @@ -173,8 +186,8 @@ we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for -allocation (to base it on lock wait time, rather than blindly -using a "try lock") + allocation (to base it on lock wait time, rather than blindly + using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken @@ -195,10 +208,13 @@ as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. -Per sb stats can be found here: -/sys/kernel/debug/gfs2//sbstats -Per glock stats can be found here: -/sys/kernel/debug/gfs2//glstats +Per sb stats can be found here:: + + /sys/kernel/debug/gfs2//sbstats + +Per glock stats can be found here:: + + /sys/kernel/debug/gfs2//glstats Assuming that debugfs is mounted on /sys/kernel/debug and also that is replaced with the name of the gfs2 filesystem @@ -206,14 +222,16 @@ in question. The abbreviations used in the output as are follows: -srtt - Smoothed round trip time for non-blocking dlm requests -srttvar - Variance estimate for srtt -srttb - Smoothed round trip time for (potentially) blocking dlm requests -srttvarb - Variance estimate for srttb -sirt - Smoothed inter-request time (for dlm requests) -sirtvar - Variance estimate for sirt -dlm - Number of dlm requests made (dcnt in glstats file) -queue - Number of glock requests queued (qcnt in glstats file) +========= ================================================================ +srtt Smoothed round trip time for non blocking dlm requests +srttvar Variance estimate for srtt +srttb Smoothed round trip time for (potentially) blocking dlm requests +srttvarb Variance estimate for srttb +sirt Smoothed inter request time (for dlm requests) +sirtvar Variance estimate for sirt +dlm Number of dlm requests made (dcnt in glstats file) +queue Number of glock requests queued (qcnt in glstats file) +========= ================================================================ The sbstats file contains a set of these stats for each glock type (so 8 lines for each type) and for each cpu (one column per cpu). The glstats file contains @@ -224,9 +242,12 @@ The gfs2_glock_lock_time tracepoint prints out the current values of the stats for the glock in question, along with some addition information on each dlm reply that is received: -status - The status of the dlm request -flags - The dlm request flags -tdiff - The time taken by this specific request +====== ======================================= +status The status of the dlm request +flags The dlm request flags +tdiff The time taken by this specific request +====== ======================================= + (remaining fields as per above list) diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 17795341e0a3..4c536e66dc4c 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -88,6 +88,7 @@ Documentation for filesystem implementations. f2fs gfs2 gfs2-uevents + gfs2-glocks hfs hfsplus hpfs diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 67657f216942..0a23fe922f42 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -615,7 +615,7 @@ prototypes:: locking rules: ============= ======== =========================== -ops mmap_sem PageLocked(page) +ops mmap_lock PageLocked(page) ============= ======== =========================== open: yes close: yes diff --git a/Documentation/s390/index.rst b/Documentation/s390/index.rst index f7af2061e406..cf71df5776b4 100644 --- a/Documentation/s390/index.rst +++ b/Documentation/s390/index.rst @@ -15,6 +15,7 @@ s390 Architecture vfio-ccw zfcpdump common_io + pci text_files diff --git a/Documentation/s390/pci.rst b/Documentation/s390/pci.rst new file mode 100644 index 000000000000..492850bff316 --- /dev/null +++ b/Documentation/s390/pci.rst @@ -0,0 +1,125 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========= +S/390 PCI +========= + +Authors: + - Pierre Morel + +Copyright, IBM Corp. 2020 + + +Command line parameters and debugfs entries +=========================================== + +Command line parameters +----------------------- + +* nomio + + Do not use PCI Mapped I/O (MIO) instructions. + +* norid + + Ignore the RID field and force use of one PCI domain per PCI function. + +debugfs entries +--------------- + +The S/390 debug feature (s390dbf) generates views to hold various debug results in sysfs directories of the form: + + * /sys/kernel/debug/s390dbf/pci_*/ + +For example: + + - /sys/kernel/debug/s390dbf/pci_msg/sprintf + Holds messages from the processing of PCI events, like machine check handling + and setting of global functionality, like UID checking. + + Change the level of logging to be more or less verbose by piping + a number between 0 and 6 to /sys/kernel/debug/s390dbf/pci_*/level. For + details, see the documentation on the S/390 debug feature at + Documentation/s390/s390dbf.rst. + +Sysfs entries +============= + +Entries specific to zPCI functions and entries that hold zPCI information. + +* /sys/bus/pci/slots/XXXXXXXX + + The slot entries are set up using the function identifier (FID) of the + PCI function. + + - /sys/bus/pci/slots/XXXXXXXX/power + + A physical function that currently supports a virtual function cannot be + powered off until all virtual functions are removed with: + echo 0 > /sys/bus/pci/devices/XXXX:XX:XX.X/sriov_numvf + +* /sys/bus/pci/devices/XXXX:XX:XX.X/ + + - function_id + A zPCI function identifier that uniquely identifies the function in the Z server. + + - function_handle + Low-level identifier used for a configured PCI function. + It might be useful for debuging. + + - pchid + Model-dependent location of the I/O adapter. + + - pfgid + PCI function group ID, functions that share identical functionality + use a common identifier. + A PCI group defines interrupts, IOMMU, IOTLB, and DMA specifics. + + - vfn + The virtual function number, from 1 to N for virtual functions, + 0 for physical functions. + + - pft + The PCI function type + + - port + The port corresponds to the physical port the function is attached to. + It also gives an indication of the physical function a virtual function + is attached to. + + - uid + The unique identifier (UID) is defined when configuring an LPAR and is + unique in the LPAR. + + - pfip/segmentX + The segments determine the isolation of a function. + They correspond to the physical path to the function. + The more the segments are different, the more the functions are isolated. + +Enumeration and hotplug +======================= + +The PCI address consists of four parts: domain, bus, device and function, +and is of this form: DDDD:BB:dd.f + +* When not using multi-functions (norid is set, or the firmware does not + support multi-functions): + + - There is only one function per domain. + + - The domain is set from the zPCI function's UID as defined during the + LPAR creation. + +* When using multi-functions (norid parameter is not set), + zPCI functions are addressed differently: + + - There is still only one bus per domain. + + - There can be up to 256 functions per bus. + + - The domain part of the address of all functions for + a multi-Function device is set from the zPCI function's UID as defined + in the LPAR creation for the function zero. + + - New functions will only be ready for use after the function zero + (the function with devfn 0) has been enumerated. diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst index fca9c4f5bd9c..8aad08a8b8a5 100644 --- a/Documentation/s390/vfio-ccw.rst +++ b/Documentation/s390/vfio-ccw.rst @@ -204,15 +204,44 @@ definition of the region is:: __u32 ret_code; } __packed; +This region is always available. + While starting an I/O request, orb_area should be filled with the guest ORB, and scsw_area should be filled with the SCSW of the Virtual Subchannel. irb_area stores the I/O result. -ret_code stores a return code for each access of the region. +ret_code stores a return code for each access of the region. The following +values may occur: + +``0`` + The operation was successful. + +``-EOPNOTSUPP`` + The orb specified transport mode or an unidentified IDAW format, or the + scsw specified a function other than the start function. + +``-EIO`` + A request was issued while the device was not in a state ready to accept + requests, or an internal error occurred. + +``-EBUSY`` + The subchannel was status pending or busy, or a request is already active. + +``-EAGAIN`` + A request was being processed, and the caller should retry. + +``-EACCES`` + The channel path(s) used for the I/O were found to be not operational. + +``-ENODEV`` + The device was found to be not operational. + +``-EINVAL`` + The orb specified a chain longer than 255 ccws, or an internal error + occurred. -This region is always available. vfio-ccw cmd region ------------------- @@ -231,6 +260,64 @@ This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD. Currently, CLEAR SUBCHANNEL and HALT SUBCHANNEL use this region. +command specifies the command to be issued; ret_code stores a return code +for each access of the region. The following values may occur: + +``0`` + The operation was successful. + +``-ENODEV`` + The device was found to be not operational. + +``-EINVAL`` + A command other than halt or clear was specified. + +``-EIO`` + A request was issued while the device was not in a state ready to accept + requests. + +``-EAGAIN`` + A request was being processed, and the caller should retry. + +``-EBUSY`` + The subchannel was status pending or busy while processing a halt request. + +vfio-ccw schib region +--------------------- + +The vfio-ccw schib region is used to return Subchannel-Information +Block (SCHIB) data to userspace:: + + struct ccw_schib_region { + #define SCHIB_AREA_SIZE 52 + __u8 schib_area[SCHIB_AREA_SIZE]; + } __packed; + +This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_SCHIB. + +Reading this region triggers a STORE SUBCHANNEL to be issued to the +associated hardware. + +vfio-ccw crw region +--------------------- + +The vfio-ccw crw region is used to return Channel Report Word (CRW) +data to userspace:: + + struct ccw_crw_region { + __u32 crw; + __u32 pad; + } __packed; + +This region is exposed via region type VFIO_REGION_SUBTYPE_CCW_CRW. + +Reading this region returns a CRW if one that is relevant for this +subchannel (e.g. one reporting changes in channel path state) is +pending, or all zeroes if not. If multiple CRWs are pending (including +possibly chained CRWs), reading this region again will return the next +one, until no more CRWs are pending and zeroes are returned. This is +similar to how STORE CHANNEL REPORT WORD works. + vfio-ccw operation details -------------------------- @@ -333,7 +420,14 @@ through DASD/ECKD device online in a guest now and use it as a block device. The current code allows the guest to start channel programs via -START SUBCHANNEL, and to issue HALT SUBCHANNEL and CLEAR SUBCHANNEL. +START SUBCHANNEL, and to issue HALT SUBCHANNEL, CLEAR SUBCHANNEL, +and STORE SUBCHANNEL. + +Currently all channel programs are prefetched, regardless of the +p-bit setting in the ORB. As a result, self modifying channel +programs are not supported. For this reason, IPL has to be handled as +a special case by a userspace/guest program; this has been implemented +in QEMU's s390-ccw bios as of QEMU 4.1. vfio-ccw supports classic (command mode) channel I/O only. Transport mode (HPF) is not supported. diff --git a/Documentation/s390/zfcpdump.rst b/Documentation/s390/zfcpdump.rst index 54e8e7caf7e7..a61de7aa8778 100644 --- a/Documentation/s390/zfcpdump.rst +++ b/Documentation/s390/zfcpdump.rst @@ -46,5 +46,5 @@ initramfs with a user space application that writes the dump to a SCSI partition. For more information on how to use zfcpdump refer to the s390 'Using the Dump -Tools book', which is available from -http://www.ibm.com/developerworks/linux/linux390. +Tools' book, which is available from IBM Knowledge Center: +https://www.ibm.com/support/knowledgecenter/linuxonibm/liaaf/lnz_r_dt.html diff --git a/Documentation/trace/histogram-design.rst b/Documentation/trace/histogram-design.rst new file mode 100644 index 000000000000..eef840043da9 --- /dev/null +++ b/Documentation/trace/histogram-design.rst @@ -0,0 +1,2115 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +Histogram Design Notes +====================== + +:Author: Tom Zanussi + +This document attempts to provide a description of how the ftrace +histograms work and how the individual pieces map to the data +structures used to implement them in trace_events_hist.c and +tracing_map.c. + +Note: All the ftrace histogram command examples assume the working + directory is the ftrace /tracing directory. For example:: + + # cd /sys/kernel/debug/tracing + +Also, the histogram output displayed for those commands will be +generally be truncated - only enough to make the point is displayed. + +'hist_debug' trace event files +============================== + +If the kernel is compiled with CONFIG_HIST_TRIGGERS_DEBUG set, an +event file named 'hist_debug' will appear in each event's +subdirectory. This file can be read at any time and will display some +of the hist trigger internals described in this document. Specific +examples and output will be described in test cases below. + +Basic histograms +================ + +First, basic histograms. Below is pretty much the simplest thing you +can do with histograms - create one with a single key on a single +event and cat the output:: + + # echo 'hist:keys=pid' >> events/sched/sched_waking/trigger + + # cat events/sched/sched_waking/hist + + { pid: 18249 } hitcount: 1 + { pid: 13399 } hitcount: 1 + { pid: 17973 } hitcount: 1 + { pid: 12572 } hitcount: 1 + ... + { pid: 10 } hitcount: 921 + { pid: 18255 } hitcount: 1444 + { pid: 25526 } hitcount: 2055 + { pid: 5257 } hitcount: 2055 + { pid: 27367 } hitcount: 2055 + { pid: 1728 } hitcount: 2161 + + Totals: + Hits: 21305 + Entries: 183 + Dropped: 0 + +What this does is create a histogram on the sched_waking event using +pid as a key and with a single value, hitcount, which even if not +explicitly specified, exists for every histogram regardless. + +The hitcount value is a per-bucket value that's automatically +incremented on every hit for the given key, which in this case is the +pid. + +So in this histogram, there's a separate bucket for each pid, and each +bucket contains a value for that bucket, counting the number of times +sched_waking was called for that pid. + +Each histogram is represented by a hist_data struct. + +To keep track of each key and value field in the histogram, hist_data +keeps an array of these fields named fields[]. The fields[] array is +an array containing struct hist_field representations of each +histogram val and key in the histogram (variables are also included +here, but are discussed later). So for the above histogram we have one +key and one value; in this case the one value is the hitcount value, +which all histograms have, regardless of whether they define that +value or not, which the above histogram does not. + +Each struct hist_field contains a pointer to the ftrace_event_field +from the event's trace_event_file along with various bits related to +that such as the size, offset, type, and a hist_field_fn_t function, +which is used to grab the field's data from the ftrace event buffer +(in most cases - some hist_fields such as hitcount don't directly map +to an event field in the trace buffer - in these cases the function +implementation gets its value from somewhere else). The flags field +indicates which type of field it is - key, value, variable, variable +reference, etc., with value being the default. + +The other important hist_data data structure in addition to the +fields[] array is the tracing_map instance created for the histogram, +which is held in the .map member. The tracing_map implements the +lock-free hash table used to implement histograms (see +kernel/trace/tracing_map.h for much more discussion about the +low-level data structures implementing the tracing_map). For the +purposes of this discussion, the tracing_map contains a number of +buckets, each bucket corresponding to a particular tracing_map_elt +object hashed by a given histogram key. + +Below is a diagram the first part of which describes the hist_data and +associated key and value fields for the histogram described above. As +you can see, there are two fields in the fields array, one val field +for the hitcount and one key field for the pid key. + +Below that is a diagram of a run-time snapshot of what the tracing_map +might look like for a given run. It attempts to show the +relationships between the hist_data fields and the tracing_map +elements for a couple hypothetical keys and values.:: + + +------------------+ + | hist_data | + +------------------+ +----------------+ + | .fields[] |---->| val = hitcount |----------------------------+ + +----------------+ +----------------+ | + | .map | | .size | | + +----------------+ +--------------+ | + | .offset | | + +--------------+ | + | .fn() | | + +--------------+ | + . | + . | + . | + +----------------+ <--- n_vals | + | key = pid |----------------------------|--+ + +----------------+ | | + | .size | | | + +--------------+ | | + | .offset | | | + +--------------+ | | + | .fn() | | | + +----------------+ <--- n_fields | | + | unused | | | + +----------------+ | | + | | | | + +--------------+ | | + | | | | + +--------------+ | | + | | | | + +--------------+ | | + n_keys = n_fields - n_vals | | + +The hist_data n_vals and n_fields delineate the extent of the fields[] | | +array and separate keys from values for the rest of the code. | | + +Below is a run-time representation of the tracing_map part of the | | +histogram, with pointers from various parts of the fields[] array | | +to corresponding parts of the tracing_map. | | + +The tracing_map consists of an array of tracing_map_entrys and a set | | +of preallocated tracing_map_elts (abbreviated below as map_entry and | | +map_elt). The total number of map_entrys in the hist_data.map array = | | +map->max_elts (actually map->map_size but only max_elts of those are | | +used. This is a property required by the map_insert() algorithm). | | + +If a map_entry is unused, meaning no key has yet hashed into it, its | | +.key value is 0 and its .val pointer is NULL. Once a map_entry has | | +been claimed, the .key value contains the key's hash value and the | | +.val member points to a map_elt containing the full key and an entry | | +for each key or value in the map_elt.fields[] array. There is an | | +entry in the map_elt.fields[] array corresponding to each hist_field | | +in the histogram, and this is where the continually aggregated sums | | +corresponding to each histogram value are kept. | | + +The diagram attempts to show the relationship between the | | +hist_data.fields[] and the map_elt.fields[] with the links drawn | | +between diagrams:: + + +-----------+ | | + | hist_data | | | + +-----------+ | | + | .fields | | | + +---------+ +-----------+ | | + | .map |---->| map_entry | | | + +---------+ +-----------+ | | + | .key |---> 0 | | + +---------+ | | + | .val |---> NULL | | + +-----------+ | | + | map_entry | | | + +-----------+ | | + | .key |---> pid = 999 | | + +---------+ +-----------+ | | + | .val |--->| map_elt | | | + +---------+ +-----------+ | | + . | .key |---> full key * | | + . +---------+ +---------------+ | | + . | .fields |--->| .sum (val) |<-+ | + +-----------+ +---------+ | 2345 | | | + | map_entry | +---------------+ | | + +-----------+ | .offset (key) |<----+ + | .key |---> 0 | 0 | | | + +---------+ +---------------+ | | + | .val |---> NULL . | | + +-----------+ . | | + | map_entry | . | | + +-----------+ +---------------+ | | + | .key | | .sum (val) or | | | + +---------+ +---------+ | .offset (key) | | | + | .val |--->| map_elt | +---------------+ | | + +-----------+ +---------+ | .sum (val) or | | | + | map_entry | | .offset (key) | | | + +-----------+ +---------------+ | | + | .key |---> pid = 4444 | | + +---------+ +-----------+ | | + | .val | | map_elt | | | + +---------+ +-----------+ | | + | .key |---> full key * | | + +---------+ +---------------+ | | + | .fields |--->| .sum (val) |<-+ | + +---------+ | 65523 | | + +---------------+ | + | .offset (key) |<----+ + | 0 | + +---------------+ + . + . + . + +---------------+ + | .sum (val) or | + | .offset (key) | + +---------------+ + | .sum (val) or | + | .offset (key) | + +---------------+ + +Abbreviations used in the diagrams:: + + hist_data = struct hist_trigger_data + hist_data.fields = struct hist_field + fn = hist_field_fn_t + map_entry = struct tracing_map_entry + map_elt = struct tracing_map_elt + map_elt.fields = struct tracing_map_field + +Whenever a new event occurs and it has a hist trigger associated with +it, event_hist_trigger() is called. event_hist_trigger() first deals +with the key: for each subkey in the key (in the above example, there +is just one subkey corresponding to pid), the hist_field that +represents that subkey is retrieved from hist_data.fields[] and the +hist_field_fn_t fn() associated with that field, along with the +field's size and offset, is used to grab that subkey's data from the +current trace record. + +Once the complete key has been retrieved, it's used to look that key +up in the tracing_map. If there's no tracing_map_elt associated with +that key, an empty one is claimed and inserted in the map for the new +key. In either case, the tracing_map_elt associated with that key is +returned. + +Once a tracing_map_elt available, hist_trigger_elt_update() is called. +As the name implies, this updates the element, which basically means +updating the element's fields. There's a tracing_map_field associated +with each key and value in the histogram, and each of these correspond +to the key and value hist_fields created when the histogram was +created. hist_trigger_elt_update() goes through each value hist_field +and, as for the keys, uses the hist_field's fn() and size and offset +to grab the field's value from the current trace record. Once it has +that value, it simply adds that value to that field's +continually-updated tracing_map_field.sum member. Some hist_field +fn()s, such as for the hitcount, don't actually grab anything from the +trace record (the hitcount fn() just increments the counter sum by 1), +but the idea is the same. + +Once all the values have been updated, hist_trigger_elt_update() is +done and returns. Note that there are also tracing_map_fields for +each subkey in the key, but hist_trigger_elt_update() doesn't look at +them or update anything - those exist only for sorting, which can +happen later. + +Basic histogram test +-------------------- + +This is a good example to try. It produces 3 value fields and 2 key +fields in the output:: + + # echo 'hist:keys=common_pid,call_site.sym:values=bytes_req,bytes_alloc,hitcount' >> events/kmem/kmalloc/trigger + +To see the debug data, cat the kmem/kmalloc's 'hist_debug' file. It +will show the trigger info of the histogram it corresponds to, along +with the address of the hist_data associated with the histogram, which +will become useful in later examples. It then displays the number of +total hist_fields associated with the histogram along with a count of +how many of those correspond to keys and how many correspond to values. + +It then goes on to display details for each field, including the +field's flags and the position of each field in the hist_data's +fields[] array, which is useful information for verifying that things +internally appear correct or not, and which again will become even +more useful in further examples:: + + # cat events/kmem/kmalloc/hist_debug + + # event histogram + # + # trigger info: hist:keys=common_pid,call_site.sym:vals=hitcount,bytes_req,bytes_alloc:sort=hitcount:size=2048 [active] + # + + hist_data: 000000005e48c9a5 + + n_vals: 3 + n_keys: 2 + n_fields: 5 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + VAL: normal u64 value + ftrace_event_field name: bytes_req + type: size_t + size: 8 + is_signed: 0 + + hist_data->fields[2]: + flags: + VAL: normal u64 value + ftrace_event_field name: bytes_alloc + type: size_t + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[3]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: common_pid + type: int + size: 8 + is_signed: 1 + + hist_data->fields[4]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: call_site + type: unsigned long + size: 8 + is_signed: 0 + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=common_pid,call_site.sym:values=bytes_req,bytes_alloc,hitcount' >> events/kmem/kmalloc/trigger + +Variables +========= + +Variables allow data from one hist trigger to be saved by one hist +trigger and retrieved by another hist trigger. For example, a trigger +on the sched_waking event can capture a timestamp for a particular +pid, and later a sched_switch event that switches to that pid event +can grab the timestamp and use it to calculate a time delta between +the two events:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> + events/sched/sched_waking/trigger + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> + events/sched/sched_switch/trigger + +In terms of the histogram data structures, variables are implemented +as another type of hist_field and for a given hist trigger are added +to the hist_data.fields[] array just after all the val fields. To +distinguish them from the existing key and val fields, they're given a +new flag type, HIST_FIELD_FL_VAR (abbreviated FL_VAR) and they also +make use of a new .var.idx field member in struct hist_field, which +maps them to an index in a new map_elt.vars[] array added to the +map_elt specifically designed to store and retrieve variable values. +The diagram below shows those new elements and adds a new variable +entry, ts0, corresponding to the ts0 variable in the sched_waking +trigger above. + +sched_waking histogram +----------------------:: + + +------------------+ + | hist_data |<-------------------------------------------------------+ + +------------------+ +-------------------+ | + | .fields[] |-->| val = hitcount | | + +----------------+ +-------------------+ | + | .map | | .size | | + +----------------+ +-----------------+ | + | .offset | | + +-----------------+ | + | .fn() | | + +-----------------+ | + | .flags | | + +-----------------+ | + | .var.idx | | + +-------------------+ | + | var = ts0 | | + +-------------------+ | + | .size | | + +-----------------+ | + | .offset | | + +-----------------+ | + | .fn() | | + +-----------------+ | + | .flags & FL_VAR | | + +-----------------+ | + | .var.idx |----------------------------+-+ | + +-----------------+ | | | + . | | | + . | | | + . | | | + +-------------------+ <--- n_vals | | | + | key = pid | | | | + +-------------------+ | | | + | .size | | | | + +-----------------+ | | | + | .offset | | | | + +-----------------+ | | | + | .fn() | | | | + +-----------------+ | | | + | .flags & FL_KEY | | | | + +-----------------+ | | | + | .var.idx | | | | + +-------------------+ <--- n_fields | | | + | unused | | | | + +-------------------+ | | | + | | | | | + +-----------------+ | | | + | | | | | + +-----------------+ | | | + | | | | | + +-----------------+ | | | + | | | | | + +-----------------+ | | | + | | | | | + +-----------------+ | | | + n_keys = n_fields - n_vals | | | + | | | + +This is very similar to the basic case. In the above diagram, we can | | | +see a new .flags member has been added to the struct hist_field | | | +struct, and a new entry added to hist_data.fields representing the ts0 | | | +variable. For a normal val hist_field, .flags is just 0 (modulo | | | +modifier flags), but if the value is defined as a variable, the .flags | | | +contains a set FL_VAR bit. | | | + +As you can see, the ts0 entry's .var.idx member contains the index | | | +into the tracing_map_elts' .vars[] array containing variable values. | | | +This idx is used whenever the value of the variable is set or read. | | | +The map_elt.vars idx assigned to the given variable is assigned and | | | +saved in .var.idx by create_tracing_map_fields() after it calls | | | +tracing_map_add_var(). | | | + +Below is a representation of the histogram at run-time, which | | | +populates the map, along with correspondence to the above hist_data and | | | +hist_field data structures. | | | + +The diagram attempts to show the relationship between the | | | +hist_data.fields[] and the map_elt.fields[] and map_elt.vars[] with | | | +the links drawn between diagrams. For each of the map_elts, you can | | | +see that the .fields[] members point to the .sum or .offset of a key | | | +or val and the .vars[] members point to the value of a variable. The | | | +arrows between the two diagrams show the linkages between those | | | +tracing_map members and the field definitions in the corresponding | | | +hist_data fields[] members.:: + + +-----------+ | | | + | hist_data | | | | + +-----------+ | | | + | .fields | | | | + +---------+ +-----------+ | | | + | .map |---->| map_entry | | | | + +---------+ +-----------+ | | | + | .key |---> 0 | | | + +---------+ | | | + | .val |---> NULL | | | + +-----------+ | | | + | map_entry | | | | + +-----------+ | | | + | .key |---> pid = 999 | | | + +---------+ +-----------+ | | | + | .val |--->| map_elt | | | | + +---------+ +-----------+ | | | + . | .key |---> full key * | | | + . +---------+ +---------------+ | | | + . | .fields |--->| .sum (val) | | | | + . +---------+ | 2345 | | | | + . +--| .vars | +---------------+ | | | + . | +---------+ | .offset (key) | | | | + . | | 0 | | | | + . | +---------------+ | | | + . | . | | | + . | . | | | + . | . | | | + . | +---------------+ | | | + . | | .sum (val) or | | | | + . | | .offset (key) | | | | + . | +---------------+ | | | + . | | .sum (val) or | | | | + . | | .offset (key) | | | | + . | +---------------+ | | | + . | | | | + . +---------------->+---------------+ | | | + . | ts0 |<--+ | | + . | 113345679876 | | | | + . +---------------+ | | | + . | unused | | | | + . | | | | | + . +---------------+ | | | + . . | | | + . . | | | + . . | | | + . +---------------+ | | | + . | unused | | | | + . | | | | | + . +---------------+ | | | + . | unused | | | | + . | | | | | + . +---------------+ | | | + . | | | + +-----------+ | | | + | map_entry | | | | + +-----------+ | | | + | .key |---> pid = 4444 | | | + +---------+ +-----------+ | | | + | .val |--->| map_elt | | | | + +---------+ +-----------+ | | | + . | .key |---> full key * | | | + . +---------+ +---------------+ | | | + . | .fields |--->| .sum (val) | | | | + +---------+ | 2345 | | | | + +--| .vars | +---------------+ | | | + | +---------+ | .offset (key) | | | | + | | 0 | | | | + | +---------------+ | | | + | . | | | + | . | | | + | . | | | + | +---------------+ | | | + | | .sum (val) or | | | | + | | .offset (key) | | | | + | +---------------+ | | | + | | .sum (val) or | | | | + | | .offset (key) | | | | + | +---------------+ | | | + | | | | + | +---------------+ | | | + +---------------->| ts0 |<--+ | | + | 213499240729 | | | + +---------------+ | | + | unused | | | + | | | | + +---------------+ | | + . | | + . | | + . | | + +---------------+ | | + | unused | | | + | | | | + +---------------+ | | + | unused | | | + | | | | + +---------------+ | | + +For each used map entry, there's a map_elt pointing to an array of | | +.vars containing the current value of the variables associated with | | +that histogram entry. So in the above, the timestamp associated with | | +pid 999 is 113345679876, and the timestamp variable in the same | | +.var.idx for pid 4444 is 213499240729. | | + +sched_switch histogram | | +---------------------- | | + +The sched_switch histogram paired with the above sched_waking | | +histogram is shown below. The most important aspect of the | | +sched_switch histogram is that it references a variable on the | | +sched_waking histogram above. | | + +The histogram diagram is very similar to the others so far displayed, | | +but it adds variable references. You can see the normal hitcount and | | +key fields along with a new wakeup_lat variable implemented in the | | +same way as the sched_waking ts0 variable, but in addition there's an | | +entry with the new FL_VAR_REF (short for HIST_FIELD_FL_VAR_REF) flag. | | + +Associated with the new var ref field are a couple of new hist_field | | +members, var.hist_data and var_ref_idx. For a variable reference, the | | +var.hist_data goes with the var.idx, which together uniquely identify | | +a particular variable on a particular histogram. The var_ref_idx is | | +just the index into the var_ref_vals[] array that caches the values of | | +each variable whenever a hist trigger is updated. Those resulting | | +values are then finally accessed by other code such as trace action | | +code that uses the var_ref_idx values to assign param values. | | + +The diagram below describes the situation for the sched_switch | | +histogram referred to before:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> | | + events/sched/sched_switch/trigger | | + | | + +------------------+ | | + | hist_data | | | + +------------------+ +-----------------------+ | | + | .fields[] |-->| val = hitcount | | | + +----------------+ +-----------------------+ | | + | .map | | .size | | | + +----------------+ +---------------------+ | | + +--| .var_refs[] | | .offset | | | + | +----------------+ +---------------------+ | | + | | .fn() | | | + | var_ref_vals[] +---------------------+ | | + | +-------------+ | .flags | | | + | | $ts0 |<---+ +---------------------+ | | + | +-------------+ | | .var.idx | | | + | | | | +---------------------+ | | + | +-------------+ | | .var.hist_data | | | + | | | | +---------------------+ | | + | +-------------+ | | .var_ref_idx | | | + | | | | +-----------------------+ | | + | +-------------+ | | var = wakeup_lat | | | + | . | +-----------------------+ | | + | . | | .size | | | + | . | +---------------------+ | | + | +-------------+ | | .offset | | | + | | | | +---------------------+ | | + | +-------------+ | | .fn() | | | + | | | | +---------------------+ | | + | +-------------+ | | .flags & FL_VAR | | | + | | +---------------------+ | | + | | | .var.idx | | | + | | +---------------------+ | | + | | | .var.hist_data | | | + | | +---------------------+ | | + | | | .var_ref_idx | | | + | | +---------------------+ | | + | | . | | + | | . | | + | | . | | + | | +-----------------------+ <--- n_vals | | + | | | key = pid | | | + | | +-----------------------+ | | + | | | .size | | | + | | +---------------------+ | | + | | | .offset | | | + | | +---------------------+ | | + | | | .fn() | | | + | | +---------------------+ | | + | | | .flags | | | + | | +---------------------+ | | + | | | .var.idx | | | + | | +-----------------------+ <--- n_fields | | + | | | unused | | | + | | +-----------------------+ | | + | | | | | | + | | +---------------------+ | | + | | | | | | + | | +---------------------+ | | + | | | | | | + | | +---------------------+ | | + | | | | | | + | | +---------------------+ | | + | | | | | | + | | +---------------------+ | | + | | n_keys = n_fields - n_vals | | + | | | | + | | | | + | | +-----------------------+ | | + +---------------------->| var_ref = $ts0 | | | + | +-----------------------+ | | + | | .size | | | + | +---------------------+ | | + | | .offset | | | + | +---------------------+ | | + | | .fn() | | | + | +---------------------+ | | + | | .flags & FL_VAR_REF | | | + | +---------------------+ | | + | | .var.idx |--------------------------+ | + | +---------------------+ | + | | .var.hist_data |----------------------------+ + | +---------------------+ + +---| .var_ref_idx | + +---------------------+ + +Abbreviations used in the diagrams:: + + hist_data = struct hist_trigger_data + hist_data.fields = struct hist_field + fn = hist_field_fn_t + FL_KEY = HIST_FIELD_FL_KEY + FL_VAR = HIST_FIELD_FL_VAR + FL_VAR_REF = HIST_FIELD_FL_VAR_REF + +When a hist trigger makes use of a variable, a new hist_field is +created with flag HIST_FIELD_FL_VAR_REF. For a VAR_REF field, the +var.idx and var.hist_data take the same values as the referenced +variable, as well as the referenced variable's size, type, and +is_signed values. The VAR_REF field's .name is set to the name of the +variable it references. If a variable reference was created using the +explicit system.event.$var_ref notation, the hist_field's system and +event_name variables are also set. + +So, in order to handle an event for the sched_switch histogram, +because we have a reference to a variable on another histogram, we +need to resolve all variable references first. This is done via the +resolve_var_refs() calls made from event_hist_trigger(). What this +does is grabs the var_refs[] array from the hist_data representing the +sched_switch histogram. For each one of those, the referenced +variable's var.hist_data along with the current key is used to look up +the corresponding tracing_map_elt in that histogram. Once found, the +referenced variable's var.idx is used to look up the variable's value +using tracing_map_read_var(elt, var.idx), which yields the value of +the variable for that element, ts0 in the case above. Note that both +the hist_fields representing both the variable and the variable +reference have the same var.idx, so this is straightforward. + +Variable and variable reference test +------------------------------------ + +This example creates a variable on the sched_waking event, ts0, and +uses it in the sched_switch trigger. The sched_switch trigger also +creates its own variable, wakeup_lat, but nothing yet uses it:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> events/sched/sched_switch/trigger + +Looking at the sched_waking 'hist_debug' output, in addition to the +normal key and value hist_fields, in the val fields section we see a +field with the HIST_FIELD_FL_VAR flag, which indicates that that field +represents a variable. Note that in addition to the variable name, +contained in the var.name field, it includes the var.idx, which is the +index into the tracing_map_elt.vars[] array of the actual variable +location. Note also that the output shows that variables live in the +same part of the hist_data->fields[] array as normal values:: + + # cat events/sched/sched_waking/hist_debug + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 000000009536f554 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +Moving on to the sched_switch trigger hist_debug output, in addition +to the unused wakeup_lat variable, we see a new section displaying +variable references. Variable references are displayed in a separate +section because in addition to to being logically separate from +variables and values, they actually live in a separate hist_data +array, var_refs[]. + +In this example, the sched_switch trigger has a reference to a +variable on the sched_waking trigger, $ts0. Looking at the details, +we can see that the var.hist_data value of the referenced variable +matches the previously displayed sched_waking trigger, and the var.idx +value matches the previously displayed var.idx value for that +variable. Also displayed is the var_ref_idx value for that variable +reference, which is where the value for that variable is cached for +use when the trigger is invoked:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000f4ee8006 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 000000009536f554 + var_ref_idx (into hist_data->var_refs[]): 0 + type: u64 + size: 8 + is_signed: 0 + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0' >> events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +Actions and Handlers +==================== + +Adding onto the previous example, we will now do something with that +wakeup_lat variable, namely send it and another field as a synthetic +event. + +The onmatch() action below basically says that whenever we have a +sched_switch event, if we have a matching sched_waking event, in this +case if we have a pid in the sched_waking histogram that matches the +the next_pid field on this sched_switch event, we retrieve the +variables specified in the wakeup_latency() trace action, and use +them to generate a new wakeup_latency event into the trace stream. + +Note that the way the trace handlers such as wakeup_latency() (which +could equivalently be written trace(wakeup_latency,$wakeup_lat,next_pid) +are implemented, the parameters specified to the trace handler must be +variables. In this case, $wakeup_lat is obviously a variable, but +next_pid isn't, since it's just naming a field in the sched_switch +trace event. Since this is something that almost every trace() and +save() action does, a special shortcut is implemented to allow field +names to be used directly in those cases. How it works is that under +the covers, a temporary variable is created for the named field, and +this variable is what is actually passed to the trace handler. In the +code and documentation, this type of variable is called a 'field +variable'. + +Fields on other trace event's histograms can be used as well. In that +case we have to generate a new histogram and an unfortunately named +'synthetic_field' (the use of synthetic here has nothing to do with +synthetic events) and use that special histogram field as a variable. + +The diagram below illustrates the new elements described above in the +context of the sched_switch histogram using the onmatch() handler and +the trace() action. + +First, we define the wakeup_latency synthetic event:: + + # echo 'wakeup_latency u64 lat; pid_t pid' >> synthetic_events + +Next, the sched_waking hist trigger as before:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> + events/sched/sched_waking/trigger + +Finally, we create a hist trigger on the sched_switch event that +generates a wakeup_latency() trace event. In this case we pass +next_pid into the wakeup_latency synthetic event invocation, which +means it will be automatically converted into a field variable:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: \ + onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid)' >> + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + +The diagram for the sched_switch event is similar to previous examples +but shows the additional field_vars[] array for hist_data and shows +the linkages between the field_vars and the variables and references +created to implement the field variables. The details are discussed +below:: + + +------------------+ + | hist_data | + +------------------+ +-----------------------+ + | .fields[] |-->| val = hitcount | + +----------------+ +-----------------------+ + | .map | | .size | + +----------------+ +---------------------+ + +---| .field_vars[] | | .offset | + | +----------------+ +---------------------+ + |+--| .var_refs[] | | .offset | + || +----------------+ +---------------------+ + || | .fn() | + || var_ref_vals[] +---------------------+ + || +-------------+ | .flags | + || | $ts0 |<---+ +---------------------+ + || +-------------+ | | .var.idx | + || | $next_pid |<-+ | +---------------------+ + || +-------------+ | | | .var.hist_data | + ||+>| $wakeup_lat | | | +---------------------+ + ||| +-------------+ | | | .var_ref_idx | + ||| | | | | +-----------------------+ + ||| +-------------+ | | | var = wakeup_lat | + ||| . | | +-----------------------+ + ||| . | | | .size | + ||| . | | +---------------------+ + ||| +-------------+ | | | .offset | + ||| | | | | +---------------------+ + ||| +-------------+ | | | .fn() | + ||| | | | | +---------------------+ + ||| +-------------+ | | | .flags & FL_VAR | + ||| | | +---------------------+ + ||| | | | .var.idx | + ||| | | +---------------------+ + ||| | | | .var.hist_data | + ||| | | +---------------------+ + ||| | | | .var_ref_idx | + ||| | | +---------------------+ + ||| | | . + ||| | | . + ||| | | . + ||| | | . + ||| +--------------+ | | . + +-->| field_var | | | . + || +--------------+ | | . + || | var | | | . + || +------------+ | | . + || | val | | | . + || +--------------+ | | . + || | field_var | | | . + || +--------------+ | | . + || | var | | | . + || +------------+ | | . + || | val | | | . + || +------------+ | | . + || . | | . + || . | | . + || . | | +-----------------------+ <--- n_vals + || +--------------+ | | | key = pid | + || | field_var | | | +-----------------------+ + || +--------------+ | | | .size | + || | var |--+| +---------------------+ + || +------------+ ||| | .offset | + || | val |-+|| +---------------------+ + || +------------+ ||| | .fn() | + || ||| +---------------------+ + || ||| | .flags | + || ||| +---------------------+ + || ||| | .var.idx | + || ||| +---------------------+ <--- n_fields + || ||| + || ||| n_keys = n_fields - n_vals + || ||| +-----------------------+ + || |+->| var = next_pid | + || | | +-----------------------+ + || | | | .size | + || | | +---------------------+ + || | | | .offset | + || | | +---------------------+ + || | | | .flags & FL_VAR | + || | | +---------------------+ + || | | | .var.idx | + || | | +---------------------+ + || | | | .var.hist_data | + || | | +-----------------------+ + || +-->| val for next_pid | + || | | +-----------------------+ + || | | | .size | + || | | +---------------------+ + || | | | .offset | + || | | +---------------------+ + || | | | .fn() | + || | | +---------------------+ + || | | | .flags | + || | | +---------------------+ + || | | | | + || | | +---------------------+ + || | | + || | | + || | | +-----------------------+ + +|------------------|-|>| var_ref = $ts0 | + | | | +-----------------------+ + | | | | .size | + | | | +---------------------+ + | | | | .offset | + | | | +---------------------+ + | | | | .fn() | + | | | +---------------------+ + | | | | .flags & FL_VAR_REF | + | | | +---------------------+ + | | +---| .var_ref_idx | + | | +-----------------------+ + | | | var_ref = $next_pid | + | | +-----------------------+ + | | | .size | + | | +---------------------+ + | | | .offset | + | | +---------------------+ + | | | .fn() | + | | +---------------------+ + | | | .flags & FL_VAR_REF | + | | +---------------------+ + | +-----| .var_ref_idx | + | +-----------------------+ + | | var_ref = $wakeup_lat | + | +-----------------------+ + | | .size | + | +---------------------+ + | | .offset | + | +---------------------+ + | | .fn() | + | +---------------------+ + | | .flags & FL_VAR_REF | + | +---------------------+ + +------------------------| .var_ref_idx | + +---------------------+ + +As you can see, for a field variable, two hist_fields are created: one +representing the variable, in this case next_pid, and one to actually +get the value of the field from the trace stream, like a normal val +field does. These are created separately from normal variable +creation and are saved in the hist_data->field_vars[] array. See +below for how these are used. In addition, a reference hist_field is +also created, which is needed to reference the field variables such as +$next_pid variable in the trace() action. + +Note that $wakeup_lat is also a variable reference, referencing the +value of the expression common_timestamp-$ts0, and so also needs to +have a hist field entry representing that reference created. + +When hist_trigger_elt_update() is called to get the normal key and +value fields, it also calls update_field_vars(), which goes through +each field_var created for the histogram, and available from +hist_data->field_vars and calls val->fn() to get the data from the +current trace record, and then uses the var's var.idx to set the +variable at the var.idx offset in the appropriate tracing_map_elt's +variable at elt->vars[var.idx]. + +Once all the variables have been updated, resolve_var_refs() can be +called from event_hist_trigger(), and not only can our $ts0 and +$next_pid references be resolved but the $wakeup_lat reference as +well. At this point, the trace() action can simply access the values +assembled in the var_ref_vals[] array and generate the trace event. + +The same process occurs for the field variables associated with the +save() action. + +Abbreviations used in the diagram:: + + hist_data = struct hist_trigger_data + hist_data.fields = struct hist_field + field_var = struct field_var + fn = hist_field_fn_t + FL_KEY = HIST_FIELD_FL_KEY + FL_VAR = HIST_FIELD_FL_VAR + FL_VAR_REF = HIST_FIELD_FL_VAR_REF + +trace() action field variable test +---------------------------------- + +This example adds to the previous test example by finally making use +of the wakeup_lat variable, but in addition also creates a couple of +field variables that then are all passed to the wakeup_latency() trace +action via the onmatch() handler. + +First, we create the wakeup_latency synthetic event:: + + # echo 'wakeup_latency u64 lat; pid_t pid; char comm[16]' >> synthetic_events + +Next, the sched_waking trigger from previous examples:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +Finally, as in the previous test example, we calculate and assign the +wakeup latency using the $ts0 reference from the sched_waking trigger +to the wakeup_lat variable, and finally use it along with a couple +sched_switch event fields, next_pid and next_comm, to generate a +wakeup_latency trace event. The next_pid and next_comm event fields +are automatically converted into field variables for this purpose:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,next_comm)' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + +The sched_waking hist_debug output shows the same data as in the +previous test example:: + + # cat events/sched/sched_waking/hist_debug + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000d60ff61f + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +The sched_switch hist_debug output shows the same key and value fields +as in the previous test example - note that wakeup_lat is still in the +val fields section, but that the new field variables are not there - +although the field variables are variables, they're held separately in +the hist_data's field_vars[] array. Although the field variables and +the normal variables are located in separate places, you can see that +the actual variable locations for those variables in the +tracing_map_elt.vars[] do have increasing indices as expected: +wakeup_lat takes the var.idx = 0 slot, while the field variables for +next_pid and next_comm have values var.idx = 1, and var.idx = 2. Note +also that those are the same values displayed for the variable +references corresponding to those variables in the variable reference +fields section. Since there are two triggers and thus two hist_data +addresses, those addresses also need to be accounted for when doing +the matching - you can see that the first variable refers to the 0 +var.idx on the previous hist trigger (see the hist_data address +associated with that trigger), while the second variable refers to the +0 var.idx on the sched_switch hist trigger, as do all the remaining +variable references. + +Finally, the action tracking variables section just shows the system +and event name for the onmatch() handler:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,next_comm) [active] + # + + hist_data: 0000000008f551b7 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000d60ff61f + var_ref_idx (into hist_data->var_refs[]): 0 + type: u64 + size: 8 + is_signed: 0 + + hist_data->var_refs[1]: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 0000000008f551b7 + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 0 + is_signed: 0 + + hist_data->var_refs[2]: + flags: + HIST_FIELD_FL_VAR_REF + name: next_pid + var.idx (into tracing_map_elt.vars[]): 1 + var.hist_data: 0000000008f551b7 + var_ref_idx (into hist_data->var_refs[]): 2 + type: pid_t + size: 4 + is_signed: 0 + + hist_data->var_refs[3]: + flags: + HIST_FIELD_FL_VAR_REF + name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + var.hist_data: 0000000008f551b7 + var_ref_idx (into hist_data->var_refs[]): 3 + type: char[16] + size: 256 + is_signed: 0 + + field variables: + + hist_data->field_vars[0]: + + field_vars[0].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_pid + var.idx (into tracing_map_elt.vars[]): 1 + + field_vars[0].val: + ftrace_event_field name: next_pid + type: pid_t + size: 4 + is_signed: 1 + + hist_data->field_vars[1]: + + field_vars[1].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + + field_vars[1].val: + ftrace_event_field name: next_comm + type: char[16] + size: 256 + is_signed: 0 + + action tracking variables (for onmax()/onchange()/onmatch()): + + hist_data->actions[0].match_data.event_system: sched + hist_data->actions[0].match_data.event: sched_waking + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,next_comm)' >> /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + + # echo '!wakeup_latency u64 lat; pid_t pid; char comm[16]' >> synthetic_events + +action_data and the trace() action +---------------------------------- + +As mentioned above, when the trace() action generates a synthetic +event, all the parameters to the synthetic event either already are +variables or are converted into variables (via field variables), and +finally all those variable values are collected via references to them +into a var_ref_vals[] array. + +The values in the var_ref_vals[] array, however, don't necessarily +follow the same ordering as the synthetic event params. To address +that, struct action_data contains another array, var_ref_idx[] that +maps the trace action params to the var_ref_vals[] values. Below is a +diagram illustrating that for the wakeup_latency() synthetic event:: + + +------------------+ wakeup_latency() + | action_data | event params var_ref_vals[] + +------------------+ +-----------------+ +-----------------+ + | .var_ref_idx[] |--->| $wakeup_lat idx |---+ | | + +----------------+ +-----------------+ | +-----------------+ + | .synth_event | | $next_pid idx |---|-+ | $wakeup_lat val | + +----------------+ +-----------------+ | | +-----------------+ + . | +->| $next_pid val | + . | +-----------------+ + . | . + +-----------------+ | . + | | | . + +-----------------+ | +-----------------+ + +--->| $wakeup_lat val | + +-----------------+ + +Basically, how this ends up getting used in the synthetic event probe +function, trace_event_raw_event_synth(), is as follows:: + + for each field i in .synth_event + val_idx = .var_ref_idx[i] + val = var_ref_vals[val_idx] + +action_data and the onXXX() handlers +------------------------------------ + +The hist trigger onXXX() actions other than onmatch(), such as onmax() +and onchange(), also make use of and internally create hidden +variables. This information is contained in the +action_data.track_data struct, and is also visible in the hist_debug +output as will be described in the example below. + +Typically, the onmax() or onchange() handlers are used in conjunction +with the save() and snapshot() actions. For example:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: \ + onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm)' >> + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + +or:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0: \ + onmax($wakeup_lat).snapshot()' >> + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger + +save() action field variable test +--------------------------------- + +For this example, instead of generating a synthetic event, the save() +action is used to save field values whenever an onmax() handler +detects that a new max latency has been hit. As in the previous +example, the values being saved are also field values, but in this +case, are kept in a separate hist_data array named save_vars[]. + +As in previous test examples, we set up the sched_waking trigger:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +In this case, however, we set up the sched_switch trigger to save some +sched_switch field values whenever we hit a new maximum latency. For +both the onmax() handler and save() action, variables will be created, +which we can use the hist_debug files to examine:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm)' >> events/sched/sched_switch/trigger + +The sched_waking hist_debug output shows the same data as in the +previous test examples:: + + # cat events/sched/sched_waking/hist_debug + + # + # trigger info: hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000e6290f48 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +The output of the sched_switch trigger shows the same val and key +values as before, but also shows a couple new sections. + +First, the action tracking variables section now shows the +actions[].track_data information describing the special tracking +variables and references used to track, in this case, the running +maximum value. The actions[].track_data.var_ref member contains the +reference to the variable being tracked, in this case the $wakeup_lat +variable. In order to perform the onmax() handler function, there +also needs to be a variable that tracks the current maximum by getting +updated whenever a new maximum is hit. In this case, we can see that +an auto-generated variable named ' __max' has been created and is +visible in the actions[].track_data.track_var variable. + +Finally, in the new 'save action variables' section, we can see that +the 4 params to the save() function have resulted in 4 field variables +being created for the purposes of saving the values of the named +fields when the max is hit. These variables are kept in a separate +save_vars[] array off of hist_data, so are displayed in a separate +section:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) [active] + # + + hist_data: 0000000057bcd28d + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000e6290f48 + var_ref_idx (into hist_data->var_refs[]): 0 + type: u64 + size: 8 + is_signed: 0 + + hist_data->var_refs[1]: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 0000000057bcd28d + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 0 + is_signed: 0 + + action tracking variables (for onmax()/onchange()/onmatch()): + + hist_data->actions[0].track_data.var_ref: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 0000000057bcd28d + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 0 + is_signed: 0 + + hist_data->actions[0].track_data.track_var: + flags: + HIST_FIELD_FL_VAR + var.name: __max + var.idx (into tracing_map_elt.vars[]): 1 + type: u64 + size: 8 + is_signed: 0 + + save action variables (save() params): + + hist_data->save_vars[0]: + + save_vars[0].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + + save_vars[0].val: + ftrace_event_field name: next_comm + type: char[16] + size: 256 + is_signed: 0 + + hist_data->save_vars[1]: + + save_vars[1].var: + flags: + HIST_FIELD_FL_VAR + var.name: prev_pid + var.idx (into tracing_map_elt.vars[]): 3 + + save_vars[1].val: + ftrace_event_field name: prev_pid + type: pid_t + size: 4 + is_signed: 1 + + hist_data->save_vars[2]: + + save_vars[2].var: + flags: + HIST_FIELD_FL_VAR + var.name: prev_prio + var.idx (into tracing_map_elt.vars[]): 4 + + save_vars[2].val: + ftrace_event_field name: prev_prio + type: int + size: 4 + is_signed: 1 + + hist_data->save_vars[3]: + + save_vars[3].var: + flags: + HIST_FIELD_FL_VAR + var.name: prev_comm + var.idx (into tracing_map_elt.vars[]): 5 + + save_vars[3].val: + ftrace_event_field name: prev_comm + type: char[16] + size: 256 + is_signed: 0 + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm)' >> events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +A couple special cases +====================== + +While the above covers the basics of the histogram internals, there +are a couple of special cases that should be discussed, since they +tend to create even more confusion. Those are field variables on other +histograms, and aliases, both described below through example tests +using the hist_debug files. + +Test of field variables on other histograms +------------------------------------------- + +This example is similar to the previous examples, but in this case, +the sched_switch trigger references a hist trigger field on another +event, namely the sched_waking event. In order to accomplish this, a +field variable is created for the other event, but since an existing +histogram can't be used, as existing histograms are immutable, a new +histogram with a matching variable is created and used, and we'll see +that reflected in the hist_debug output shown below. + +First, we create the wakeup_latency synthetic event. Note the +addition of the prio field:: + + # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> synthetic_events + +As in previous test examples, we set up the sched_waking trigger:: + + # echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +Here we set up a hist trigger on sched_switch to send a wakeup_latency +event using an onmatch handler naming the sched_waking event. Note +that the third param being passed to the wakeup_latency() is prio, +which is a field name that needs to have a field variable created for +it. There isn't however any prio field on the sched_switch event so +it would seem that it wouldn't be possible to create a field variable +for it. The matching sched_waking event does have a prio field, so it +should be possible to make use of it for this purpose. The problem +with that is that it's not currently possible to define a new variable +on an existing histogram, so it's not possible to add a new prio field +variable to the existing sched_waking histogram. It is however +possible to create an additional new 'matching' sched_waking histogram +for the same event, meaning that it uses the same key and filters, and +define the new prio field variable on that. + +Here's the sched_switch trigger:: + + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,prio)' >> events/sched/sched_switch/trigger + +And here's the output of the hist_debug information for the +sched_waking hist trigger. Note that there are two histograms +displayed in the output: the first is the normal sched_waking +histogram we've seen in the previous examples, and the second is the +special histogram we created to provide the prio field variable. + +Looking at the second histogram below, we see a variable with the name +synthetic_prio. This is the field variable created for the prio field +on that sched_waking histogram:: + + # cat events/sched/sched_waking/hist_debug + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000349570e4 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:synthetic_prio=prio:sort=hitcount:size=2048 [active] + # + + hist_data: 000000006920cf38 + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + ftrace_event_field name: prio + var.name: synthetic_prio + var.idx (into tracing_map_elt.vars[]): 0 + type: int + size: 4 + is_signed: 1 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +Looking at the sched_switch histogram below, we can see a reference to +the synthetic_prio variable on sched_waking, and looking at the +associated hist_data address we see that it is indeed associated with +the new histogram. Note also that the other references are to a +normal variable, wakeup_lat, and to a normal field variable, next_pid, +the details of which are in the field variables section:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,prio) [active] + # + + hist_data: 00000000a73b67df + + n_vals: 2 + n_keys: 1 + n_fields: 3 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000349570e4 + var_ref_idx (into hist_data->var_refs[]): 0 + type: u64 + size: 8 + is_signed: 0 + + hist_data->var_refs[1]: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000a73b67df + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 0 + is_signed: 0 + + hist_data->var_refs[2]: + flags: + HIST_FIELD_FL_VAR_REF + name: next_pid + var.idx (into tracing_map_elt.vars[]): 1 + var.hist_data: 00000000a73b67df + var_ref_idx (into hist_data->var_refs[]): 2 + type: pid_t + size: 4 + is_signed: 0 + + hist_data->var_refs[3]: + flags: + HIST_FIELD_FL_VAR_REF + name: synthetic_prio + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 000000006920cf38 + var_ref_idx (into hist_data->var_refs[]): 3 + type: int + size: 4 + is_signed: 1 + + field variables: + + hist_data->field_vars[0]: + + field_vars[0].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_pid + var.idx (into tracing_map_elt.vars[]): 1 + + field_vars[0].val: + ftrace_event_field name: next_pid + type: pid_t + size: 4 + is_signed: 1 + + action tracking variables (for onmax()/onchange()/onmatch()): + + hist_data->actions[0].match_data.event_system: sched + hist_data->actions[0].match_data.event: sched_waking + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,prio)' >> events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + + # echo '!wakeup_latency u64 lat; pid_t pid; int prio' >> synthetic_events + +Alias test +---------- + +This example is very similar to previous examples, but demonstrates +the alias flag. + +First, we create the wakeup_latency synthetic event:: + + # echo 'wakeup_latency u64 lat; pid_t pid; char comm[16]' >> synthetic_events + +Next, we create a sched_waking trigger similar to previous examples, +but in this case we save the pid in the waking_pid variable:: + + # echo 'hist:keys=pid:waking_pid=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + +For the sched_switch trigger, instead of using $waking_pid directly in +the wakeup_latency synthetic event invocation, we create an alias of +$waking_pid named $woken_pid, and use that in the synthetic event +invocation instead:: + + # echo 'hist:keys=next_pid:woken_pid=$waking_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,$woken_pid,next_comm)' >> events/sched/sched_switch/trigger + +Looking at the sched_waking hist_debug output, in addition to the +normal fields, we can see the waking_pid variable:: + + # cat events/sched/sched_waking/hist_debug + + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount:waking_pid=pid,ts0=common_timestamp.usecs:sort=hitcount:size=2048:clock=global [active] + # + + hist_data: 00000000a250528c + + n_vals: 3 + n_keys: 1 + n_fields: 4 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + ftrace_event_field name: pid + var.name: waking_pid + var.idx (into tracing_map_elt.vars[]): 0 + type: pid_t + size: 4 + is_signed: 1 + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_VAR + var.name: ts0 + var.idx (into tracing_map_elt.vars[]): 1 + type: u64 + size: 8 + is_signed: 0 + + key fields: + + hist_data->fields[3]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: pid + type: pid_t + size: 8 + is_signed: 1 + +The sched_switch hist_debug output shows that a variable named +woken_pid has been created but that it also has the +HIST_FIELD_FL_ALIAS flag set. It also has the HIST_FIELD_FL_VAR flag +set, which is why it appears in the val field section. + +Despite that implementation detail, an alias variable is actually more +like a variable reference; in fact it can be thought of as a reference +to a reference. The implementation copies the var_ref->fn() from the +variable reference being referenced, in this case, the waking_pid +fn(), which is hist_field_var_ref() and makes that the fn() of the +alias. The hist_field_var_ref() fn() requires the var_ref_idx of the +variable reference it's using, so waking_pid's var_ref_idx is also +copied to the alias. The end result is that when the value of alias +is retrieved, in the end it just does the same thing the original +reference would have done and retrieves the same value from the +var_ref_vals[] array. You can verify this in the output by noting +that the var_ref_idx of the alias, in this case woken_pid, is the same +as the var_ref_idx of the reference, waking_pid, in the variable +reference fields section. + +Additionally, once it gets that value, since it is also a variable, it +then saves that value into its var.idx. So the var.idx of the +woken_pid alias is 0, which it fills with the value from var_ref_idx 0 +when its fn() is called to update itself. You'll also notice that +there's a woken_pid var_ref in the variable refs section. That is the +reference to the woken_pid alias variable, and you can see that it +retrieves the value from the same var.idx as the woken_pid alias, 0, +and then in turn saves that value in its own var_ref_idx slot, 3, and +the value at this position is finally what gets assigned to the +$woken_pid slot in the trace event invocation:: + + # cat events/sched/sched_switch/hist_debug + + # event histogram + # + # trigger info: hist:keys=next_pid:vals=hitcount:woken_pid=$waking_pid,wakeup_lat=common_timestamp.usecs-$ts0:sort=hitcount:size=2048:clock=global:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,$woken_pid,next_comm) [active] + # + + hist_data: 0000000055d65ed0 + + n_vals: 3 + n_keys: 1 + n_fields: 4 + + val fields: + + hist_data->fields[0]: + flags: + VAL: HIST_FIELD_FL_HITCOUNT + type: u64 + size: 8 + is_signed: 0 + + hist_data->fields[1]: + flags: + HIST_FIELD_FL_VAR + HIST_FIELD_FL_ALIAS + var.name: woken_pid + var.idx (into tracing_map_elt.vars[]): 0 + var_ref_idx (into hist_data->var_refs[]): 0 + type: pid_t + size: 4 + is_signed: 1 + + hist_data->fields[2]: + flags: + HIST_FIELD_FL_VAR + var.name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 1 + type: u64 + size: 0 + is_signed: 0 + + key fields: + + hist_data->fields[3]: + flags: + HIST_FIELD_FL_KEY + ftrace_event_field name: next_pid + type: pid_t + size: 8 + is_signed: 1 + + variable reference fields: + + hist_data->var_refs[0]: + flags: + HIST_FIELD_FL_VAR_REF + name: waking_pid + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 00000000a250528c + var_ref_idx (into hist_data->var_refs[]): 0 + type: pid_t + size: 4 + is_signed: 1 + + hist_data->var_refs[1]: + flags: + HIST_FIELD_FL_VAR_REF + name: ts0 + var.idx (into tracing_map_elt.vars[]): 1 + var.hist_data: 00000000a250528c + var_ref_idx (into hist_data->var_refs[]): 1 + type: u64 + size: 8 + is_signed: 0 + + hist_data->var_refs[2]: + flags: + HIST_FIELD_FL_VAR_REF + name: wakeup_lat + var.idx (into tracing_map_elt.vars[]): 1 + var.hist_data: 0000000055d65ed0 + var_ref_idx (into hist_data->var_refs[]): 2 + type: u64 + size: 0 + is_signed: 0 + + hist_data->var_refs[3]: + flags: + HIST_FIELD_FL_VAR_REF + name: woken_pid + var.idx (into tracing_map_elt.vars[]): 0 + var.hist_data: 0000000055d65ed0 + var_ref_idx (into hist_data->var_refs[]): 3 + type: pid_t + size: 4 + is_signed: 1 + + hist_data->var_refs[4]: + flags: + HIST_FIELD_FL_VAR_REF + name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + var.hist_data: 0000000055d65ed0 + var_ref_idx (into hist_data->var_refs[]): 4 + type: char[16] + size: 256 + is_signed: 0 + + field variables: + + hist_data->field_vars[0]: + + field_vars[0].var: + flags: + HIST_FIELD_FL_VAR + var.name: next_comm + var.idx (into tracing_map_elt.vars[]): 2 + + field_vars[0].val: + ftrace_event_field name: next_comm + type: char[16] + size: 256 + is_signed: 0 + + action tracking variables (for onmax()/onchange()/onmatch()): + + hist_data->actions[0].match_data.event_system: sched + hist_data->actions[0].match_data.event: sched_waking + +The commands below can be used to clean things up for the next test:: + + # echo '!hist:keys=next_pid:woken_pid=$waking_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,$woken_pid,next_comm)' >> events/sched/sched_switch/trigger + + # echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_waking/trigger + + # echo '!wakeup_latency u64 lat; pid_t pid; char comm[16]' >> synthetic_events diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 561969754bc0..6f9e000757fa 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -191,15 +191,15 @@ The usage pattern is:: again: range.notifier_seq = mmu_interval_read_begin(&interval_sub); - down_read(&mm->mmap_sem); + mmap_read_lock(mm); ret = hmm_range_fault(&range); if (ret) { - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (ret == -EBUSY) goto again; return ret; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); take_lock(driver->update); if (mmu_interval_read_retry(&ni, range.notifier_seq) { diff --git a/Documentation/vm/transhuge.rst b/Documentation/vm/transhuge.rst index 37c57ca32629..0ed23e59abe5 100644 --- a/Documentation/vm/transhuge.rst +++ b/Documentation/vm/transhuge.rst @@ -98,9 +98,9 @@ split_huge_page() or split_huge_pmd() has a cost. To make pagetable walks huge pmd aware, all you need to do is to call pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the -mmap_sem in read (or write) mode to be sure a huge pmd cannot be +mmap_lock in read (or write) mode to be sure a huge pmd cannot be created from under you by khugepaged (khugepaged collapse_huge_page -takes the mmap_sem in write mode in addition to the anon_vma lock). If +takes the mmap_lock in write mode in addition to the anon_vma lock). If pmd_trans_huge returns false, you just fallback in the old code paths. If instead pmd_trans_huge returns true, you have to take the page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the diff --git a/MAINTAINERS b/MAINTAINERS index 5e7c42fecb2a..456162183074 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7251,7 +7251,7 @@ L: cluster-devel@redhat.com S: Supported W: http://sources.redhat.com/cluster/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git -F: Documentation/filesystems/gfs2*.txt +F: Documentation/filesystems/gfs2* F: fs/gfs2/ F: include/uapi/linux/gfs2_ondisk.h @@ -8502,6 +8502,7 @@ F: drivers/mtd/nand/raw/ingenic/ F: drivers/pinctrl/pinctrl-ingenic.c F: drivers/power/supply/ingenic-battery.c F: drivers/pwm/pwm-jz4740.c +F: drivers/remoteproc/ingenic_rproc.c F: drivers/rtc/rtc-jz4740.c F: drivers/tty/serial/8250/8250_ingenic.c F: drivers/usb/musb/jz4740.c @@ -14840,6 +14841,7 @@ S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ F: arch/s390/pci/ F: drivers/pci/hotplug/s390_pci_hpc.c +F: Documentation/s390/pci.rst S390 VFIO AP DRIVER M: Tony Krowiak diff --git a/arch/alpha/boot/bootp.c b/arch/alpha/boot/bootp.c index 95c0359f4858..00266e6e1b71 100644 --- a/arch/alpha/boot/bootp.c +++ b/arch/alpha/boot/bootp.c @@ -16,7 +16,6 @@ #include #include -#include #include #include diff --git a/arch/alpha/boot/bootpz.c b/arch/alpha/boot/bootpz.c index 99b8d7dc344b..43af71835adf 100644 --- a/arch/alpha/boot/bootpz.c +++ b/arch/alpha/boot/bootpz.c @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/arch/alpha/boot/main.c b/arch/alpha/boot/main.c index 8f5ed8610970..e5347a080008 100644 --- a/arch/alpha/boot/main.c +++ b/arch/alpha/boot/main.c @@ -14,7 +14,6 @@ #include #include -#include #include diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h index 89128489cb59..9945ff483eaf 100644 --- a/arch/alpha/include/asm/cacheflush.h +++ b/arch/alpha/include/asm/cacheflush.h @@ -4,19 +4,6 @@ #include -/* Caches aren't brain-dead on the Alpha. */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - /* Note that the following two definitions are _highly_ dependent on the contexts in which they are used in the kernel. I personally think it is criminal how loosely defined these macros are. */ @@ -48,7 +35,7 @@ extern void smp_imb(void); extern void __load_new_mm_context(struct mm_struct *); static inline void -flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { if (vma->vm_flags & VM_EXEC) { @@ -59,20 +46,17 @@ flush_icache_user_range(struct vm_area_struct *vma, struct page *page, mm->context[smp_processor_id()] = 0; } } -#else -extern void flush_icache_user_range(struct vm_area_struct *vma, +#define flush_icache_user_page flush_icache_user_page +#else /* CONFIG_SMP */ +extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); -#endif +#define flush_icache_user_page flush_icache_user_page +#endif /* CONFIG_SMP */ /* This is used only in __do_fault and do_swap_page. */ #define flush_icache_page(vma, page) \ - flush_icache_user_range((vma), (page), 0, 0) + flush_icache_user_page((vma), (page), 0, 0) -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ -do { memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ -} while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include #endif /* _ALPHA_CACHEFLUSH_H */ diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index d1ed5a8133c5..13bea465f1c0 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 0267aa8a4f86..162c17b2631f 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -276,15 +276,6 @@ extern inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) &= ~_PAGE_FOW; return extern inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= __DIRTY_BITS; return pte; } extern inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= __ACCESS_BITS; return pte; } -#define PAGE_DIR_OFFSET(tsk,address) pgd_offset((tsk),(address)) - -/* to find an entry in a kernel page-table-directory */ -#define pgd_offset_k(address) pgd_offset(&init_mm, (address)) - -/* to find an entry in a page-table-directory. */ -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) -#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) - /* * The smp_read_barrier_depends() in the following functions are required to * order the load of *dir (the pointer in the top level page table) with any @@ -305,6 +296,7 @@ extern inline pmd_t * pmd_offset(pud_t * dir, unsigned long address) smp_read_barrier_depends(); /* see above */ return ret; } +#define pmd_offset pmd_offset /* Find an entry in the third-level page table.. */ extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address) @@ -314,9 +306,7 @@ extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address) smp_read_barrier_depends(); /* see above */ return ret; } - -#define pte_offset_map(dir,addr) pte_offset_kernel((dir),(addr)) -#define pte_unmap(pte) do { } while (0) +#define pte_offset_kernel pte_offset_kernel extern pgd_t swapper_pg_dir[1024]; @@ -355,8 +345,6 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) extern void paging_init(void); -#include - /* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */ #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 48b81d015d8a..b45f0b0d6511 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/proto.h b/arch/alpha/kernel/proto.h index f1fce942fddc..701a05090141 100644 --- a/arch/alpha/kernel/proto.h +++ b/arch/alpha/kernel/proto.h @@ -2,8 +2,6 @@ #include #include -#include - /* Prototypes of functions used across modules here in this directory. */ #define vucp volatile unsigned char * diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index cb8d599e72d6..8c43212ae38e 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -19,7 +19,6 @@ #include #include -#include #include #include "proto.h" diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 6fa802c495b4..f5c42a8fcf9c 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -55,7 +55,6 @@ static struct notifier_block alpha_panic_block = { }; #include -#include #include #include #include diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 5f90df30be20..631cc17410d1 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -36,7 +36,6 @@ #include #include -#include #include #include #include @@ -740,7 +739,7 @@ ipi_flush_icache_page(void *x) } void -flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { struct mm_struct *mm = vma->vm_mm; diff --git a/arch/alpha/kernel/sys_alcor.c b/arch/alpha/kernel/sys_alcor.c index ce5430056f65..e063b3857b3d 100644 --- a/arch/alpha/kernel/sys_alcor.c +++ b/arch/alpha/kernel/sys_alcor.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_cabriolet.c b/arch/alpha/kernel/sys_cabriolet.c index 0aa6a27d0e2f..47459b73cdb7 100644 --- a/arch/alpha/kernel/sys_cabriolet.c +++ b/arch/alpha/kernel/sys_cabriolet.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c index d33508621820..9fb445d7dca5 100644 --- a/arch/alpha/kernel/sys_dp264.c +++ b/arch/alpha/kernel/sys_dp264.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_eb64p.c b/arch/alpha/kernel/sys_eb64p.c index 1cdfe55fb987..3c43fd347526 100644 --- a/arch/alpha/kernel/sys_eb64p.c +++ b/arch/alpha/kernel/sys_eb64p.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_eiger.c b/arch/alpha/kernel/sys_eiger.c index 016f79251141..bf99dcfd40c4 100644 --- a/arch/alpha/kernel/sys_eiger.c +++ b/arch/alpha/kernel/sys_eiger.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index d0d44f543d77..0a2ab6cb18db 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include "proto.h" diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c index 533899a4a1a1..83d6c53d6d4d 100644 --- a/arch/alpha/kernel/sys_marvel.c +++ b/arch/alpha/kernel/sys_marvel.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_miata.c b/arch/alpha/kernel/sys_miata.c index 702292af2225..e1bee8f84c58 100644 --- a/arch/alpha/kernel/sys_miata.c +++ b/arch/alpha/kernel/sys_miata.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_mikasa.c b/arch/alpha/kernel/sys_mikasa.c index 3af4f94113e1..7690dfd57cb6 100644 --- a/arch/alpha/kernel/sys_mikasa.c +++ b/arch/alpha/kernel/sys_mikasa.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c index 32850e45834b..53adf43dcd44 100644 --- a/arch/alpha/kernel/sys_nautilus.c +++ b/arch/alpha/kernel/sys_nautilus.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_noritake.c b/arch/alpha/kernel/sys_noritake.c index b106f327f765..47f3ce4f719a 100644 --- a/arch/alpha/kernel/sys_noritake.c +++ b/arch/alpha/kernel/sys_noritake.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_rawhide.c b/arch/alpha/kernel/sys_rawhide.c index b76f65d0e8b5..b5846ffdadce 100644 --- a/arch/alpha/kernel/sys_rawhide.c +++ b/arch/alpha/kernel/sys_rawhide.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_ruffian.c b/arch/alpha/kernel/sys_ruffian.c index d33074011960..4b1c8d85c4f0 100644 --- a/arch/alpha/kernel/sys_ruffian.c +++ b/arch/alpha/kernel/sys_ruffian.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_rx164.c b/arch/alpha/kernel/sys_rx164.c index 4d85eaeb44aa..94046f9aea08 100644 --- a/arch/alpha/kernel/sys_rx164.c +++ b/arch/alpha/kernel/sys_rx164.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_sable.c b/arch/alpha/kernel/sys_sable.c index 3cf0d32da5d8..930005b2f630 100644 --- a/arch/alpha/kernel/sys_sable.c +++ b/arch/alpha/kernel/sys_sable.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_sio.c b/arch/alpha/kernel/sys_sio.c index a6bdc1da47ad..7c420d8dac53 100644 --- a/arch/alpha/kernel/sys_sio.c +++ b/arch/alpha/kernel/sys_sio.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_sx164.c b/arch/alpha/kernel/sys_sx164.c index 17cc203176c8..dd9de84b630c 100644 --- a/arch/alpha/kernel/sys_sx164.c +++ b/arch/alpha/kernel/sys_sx164.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_takara.c b/arch/alpha/kernel/sys_takara.c index e230c6864088..9e2adb69bc74 100644 --- a/arch/alpha/kernel/sys_takara.c +++ b/arch/alpha/kernel/sys_takara.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c index c8390d8de140..b1f3b4fcf99b 100644 --- a/arch/alpha/kernel/sys_titan.c +++ b/arch/alpha/kernel/sys_titan.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/sys_wildfire.c b/arch/alpha/kernel/sys_wildfire.c index 2191bde161fd..2c54d707142a 100644 --- a/arch/alpha/kernel/sys_wildfire.c +++ b/arch/alpha/kernel/sys_wildfire.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index f6b9664ac504..49754e07e04f 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -121,10 +121,10 @@ dik_show_code(unsigned int *pc) } static void -dik_show_trace(unsigned long *sp) +dik_show_trace(unsigned long *sp, const char *loglvl) { long i = 0; - printk("Trace:\n"); + printk("%sTrace:\n", loglvl); while (0x1ff8 & (unsigned long) sp) { extern char _stext[], _etext[]; unsigned long tmp = *sp; @@ -133,24 +133,24 @@ dik_show_trace(unsigned long *sp) continue; if (tmp >= (unsigned long) &_etext) continue; - printk("[<%lx>] %pSR\n", tmp, (void *)tmp); + printk("%s[<%lx>] %pSR\n", loglvl, tmp, (void *)tmp); if (i > 40) { - printk(" ..."); + printk("%s ...", loglvl); break; } } - printk("\n"); + printk("%s\n", loglvl); } static int kstack_depth_to_print = 24; -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { unsigned long *stack; int i; /* - * debugging aid: "show_stack(NULL);" prints the + * debugging aid: "show_stack(NULL, NULL, KERN_EMERG);" prints the * back trace for this cpu. */ if(sp==NULL) @@ -163,14 +163,14 @@ void show_stack(struct task_struct *task, unsigned long *sp) if ((i % 4) == 0) { if (i) pr_cont("\n"); - printk(" "); + printk("%s ", loglvl); } else { pr_cont(" "); } pr_cont("%016lx", *stack++); } pr_cont("\n"); - dik_show_trace(sp); + dik_show_trace(sp, loglvl); } void @@ -184,7 +184,7 @@ die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) printk("%s(%d): %s %ld\n", current->comm, task_pid_nr(current), str, err); dik_show_regs(regs, r9_15); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); - dik_show_trace((unsigned long *)(regs+1)); + dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT); dik_show_code((unsigned int *)regs->pc); if (test_and_set_thread_flag (TIF_DIE_IF_KERNEL)) { @@ -625,7 +625,7 @@ got_exception: printk("gp = %016lx sp = %p\n", regs->gp, regs+1); dik_show_code((unsigned int *)pc); - dik_show_trace((unsigned long *)(regs+1)); + dik_show_trace((unsigned long *)(regs+1), KERN_DEFAULT); if (test_and_set_thread_flag (TIF_DIE_IF_KERNEL)) { printk("die_if_kernel recursion detected.\n"); @@ -957,12 +957,12 @@ give_sigsegv: si_code = SEGV_ACCERR; else { struct mm_struct *mm = current->mm; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); if (find_vma(mm, (unsigned long)va)) si_code = SEGV_ACCERR; else si_code = SEGV_MAPERR; - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); } send_sig_fault(SIGSEGV, si_code, va, 0, current); return; diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index c2d7b6d7bac7..c2303a8c2b9f 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -117,7 +117,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, if (user_mode(regs)) flags |= FAULT_FLAG_USER; retry: - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, address); if (!vma) goto bad_area; @@ -171,7 +171,7 @@ retry: if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; - /* No need to up_read(&mm->mmap_sem) as we would + /* No need to mmap_read_unlock(mm) as we would * have already released it in __lock_page_or_retry * in mm/filemap.c. */ @@ -180,14 +180,14 @@ retry: } } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return; /* Something tried to access memory that isn't in our memory map. Fix it, but check if it's kernel or user first. */ bad_area: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (user_mode(regs)) goto do_sigsegv; @@ -211,14 +211,14 @@ retry: /* We ran out of memory, or some other thing happened to us that made us unable to handle the page fault gracefully. */ out_of_memory: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); if (!user_mode(regs)) goto no_context; pagefault_out_of_memory(); return; do_sigbus: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* Send a sigbus, regardless of whether we were in kernel or user mode. */ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) address, 0); diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 667cd21393b5..3c42b3147fd6 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include diff --git a/arch/arc/include/asm/bug.h b/arch/arc/include/asm/bug.h index 0be19fd1a412..4c453ba96c51 100644 --- a/arch/arc/include/asm/bug.h +++ b/arch/arc/include/asm/bug.h @@ -13,7 +13,8 @@ struct task_struct; void show_regs(struct pt_regs *regs); -void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs); +void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs, + const char *loglvl); void show_kernel_fault_diag(const char *str, struct pt_regs *regs, unsigned long address); void die(const char *str, struct pt_regs *regs, unsigned long address); diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 12be7e1b7cc0..f1ed17edb085 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -248,9 +248,6 @@ extern char empty_zero_page[PAGE_SIZE]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -#define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) - #define set_pte(pteptr, pteval) ((*(pteptr)) = (pteval)) #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) @@ -282,18 +279,6 @@ static inline void pmd_set(pmd_t *pmdp, pte_t *ptep) /* Don't use virt_to_pfn for macros below: could cause truncations for PAE40*/ #define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) -#define __pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) - -/* - * pte_offset gets a @ptr to PMD entry (PGD in our 2-tier paging system) - * and returns ptr to PTE entry corresponding to @addr - */ -#define pte_offset(dir, addr) ((pte_t *)(pmd_page_vaddr(*dir)) +\ - __pte_index(addr)) - -/* No mapping of Page Tables in high mem etc, so following same as above */ -#define pte_offset_kernel(dir, addr) pte_offset(dir, addr) -#define pte_offset_map(dir, addr) pte_offset(dir, addr) /* Zoo of pte_xxx function */ #define pte_read(pte) (pte_val(pte) & _PAGE_READ) @@ -331,13 +316,6 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, pteval); } -/* - * All kernel related VM pages are in init's mm. - */ -#define pgd_offset_k(address) pgd_offset(&init_mm, address) -#define pgd_index(addr) ((addr) >> PGDIR_SHIFT) -#define pgd_offset(mm, addr) (((mm)->pgd)+pgd_index(addr)) - /* * Macro to quickly access the PGD entry, utlising the fact that some * arch may cache the pointer to Page Directory of "current" task @@ -390,8 +368,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, #include #endif -#include - /* to cope with aliasing VIPT cache */ #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index 315528f04bc1..8c8e5172fecd 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -90,10 +90,10 @@ fault: if (unlikely(ret != -EFAULT)) goto fail; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); ret = fixup_user_fault(current, current->mm, (unsigned long) uaddr, FAULT_FLAG_WRITE, NULL); - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (likely(!ret)) goto again; diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index 1e440bbfa876..feba91c9d969 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -158,9 +158,11 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs, /* Call-back which plugs into unwinding core to dump the stack in * case of panic/OOPs/BUG etc */ -static int __print_sym(unsigned int address, void *unused) +static int __print_sym(unsigned int address, void *arg) { - printk(" %pS\n", (void *)address); + const char *loglvl = arg; + + printk("%s %pS\n", loglvl, (void *)address); return 0; } @@ -217,17 +219,18 @@ static int __get_first_nonsched(unsigned int address, void *unused) *------------------------------------------------------------------------- */ -noinline void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs) +noinline void show_stacktrace(struct task_struct *tsk, struct pt_regs *regs, + const char *loglvl) { - pr_info("\nStack Trace:\n"); - arc_unwind_core(tsk, regs, __print_sym, NULL); + printk("%s\nStack Trace:\n", loglvl); + arc_unwind_core(tsk, regs, __print_sym, (void *)loglvl); } EXPORT_SYMBOL(show_stacktrace); /* Expected by sched Code */ -void show_stack(struct task_struct *tsk, unsigned long *sp) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { - show_stacktrace(tsk, NULL); + show_stacktrace(tsk, NULL, loglvl); } /* Another API expected by schedular, shows up in "ps" as Wait Channel diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index 3393558876a9..28e8bf04b253 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -89,7 +89,7 @@ static void show_faulting_vma(unsigned long address) /* can't use print_vma_addr() yet as it doesn't check for * non-inclusive vma */ - down_read(&active_mm->mmap_sem); + mmap_read_lock(active_mm); vma = find_vma(active_mm, address); /* check against the find_vma( ) behaviour which returns the next VMA @@ -111,7 +111,7 @@ static void show_faulting_vma(unsigned long address) } else pr_info(" @No matching VMA found\n"); - up_read(&active_mm->mmap_sem); + mmap_read_unlock(active_mm); } static void show_ecr_verbose(struct pt_regs *regs) @@ -240,5 +240,5 @@ void show_kernel_fault_diag(const char *str, struct pt_regs *regs, /* Show stack trace if this Fatality happened in kernel mode */ if (!user_mode(regs)) - show_stacktrace(current, regs); + show_stacktrace(current, regs, KERN_DEFAULT); } diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 92b339c7adba..72f5405a7ec5 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -107,7 +107,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs) flags |= FAULT_FLAG_WRITE; retry: - down_read(&mm->mmap_sem); + mmap_read_lock(mm); vma = find_vma(mm, address); if (!vma) @@ -141,7 +141,7 @@ retry: } /* - * Fault retry nuances, mmap_sem already relinquished by core mm + * Fault retry nuances, mmap_lock already relinquished by core mm */ if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) { @@ -150,7 +150,7 @@ retry: } bad_area: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* * Major/minor page fault accounting diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 479b0d72d3cf..1b9f473c6369 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -6,8 +6,8 @@ #include #include #include +#include #include -#include #include #include @@ -92,17 +92,9 @@ EXPORT_SYMBOL(kunmap_atomic_high); static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr) { - pgd_t *pgd_k; - p4d_t *p4d_k; - pud_t *pud_k; - pmd_t *pmd_k; + pmd_t *pmd_k = pmd_off_k(kvaddr); pte_t *pte_k; - pgd_k = pgd_offset_k(kvaddr); - p4d_k = p4d_offset(pgd_k, kvaddr); - pud_k = pud_offset(p4d_k, kvaddr); - pmd_k = pmd_offset(pud_k, kvaddr); - pte_k = (pte_t *)memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); if (!pte_k) panic("%s: Failed to allocate %lu bytes align=0x%lx\n", diff --git a/arch/arc/mm/tlbex.S b/arch/arc/mm/tlbex.S index 2efaf6ca0c06..31f54bdd95f2 100644 --- a/arch/arc/mm/tlbex.S +++ b/arch/arc/mm/tlbex.S @@ -33,9 +33,9 @@ */ #include +#include #include #include -#include #include #include #include diff --git a/arch/arm/include/asm/bug.h b/arch/arm/include/asm/bug.h index deef4d0cb3b5..673c7dd75ab9 100644 --- a/arch/arm/include/asm/bug.h +++ b/arch/arm/include/asm/bug.h @@ -82,7 +82,8 @@ void hook_ifault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, int code, const char *name); -extern asmlinkage void c_backtrace(unsigned long fp, int pmode); +extern asmlinkage void c_backtrace(unsigned long fp, int pmode, + const char *loglvl); struct mm_struct; void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr); diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 7114b9aa46b8..2e24e765e6d3 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -258,11 +258,11 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr #define flush_cache_dup_mm(mm) flush_cache_mm(mm) /* - * flush_cache_user_range is used when we want to ensure that the + * flush_icache_user_range is used when we want to ensure that the * Harvard caches are synchronised for the user space address range. * This is used for the ARM private sys_cacheflush system call. */ -#define flush_cache_user_range(s,e) __cpuc_coherent_user_range(s,e) +#define flush_icache_user_range(s,e) __cpuc_coherent_user_range(s,e) /* * Perform necessary cache operations to ensure that data previously @@ -318,9 +318,6 @@ extern void flush_kernel_dcache_page(struct page *); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) -#define flush_icache_user_range(vma,page,addr,len) \ - flush_dcache_page(page) - /* * We don't appear to need to do anything here. In fact, if we did, we'd * duplicate cache flushing elsewhere performed by flush_dcache_page(). diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index 9383f236e795..84dc0ba822f5 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -13,7 +13,6 @@ #include #include #include -#include #include #ifdef CONFIG_EFI diff --git a/arch/arm/include/asm/fixmap.h b/arch/arm/include/asm/fixmap.h index 472c93db5dac..fc56fc3e1931 100644 --- a/arch/arm/include/asm/fixmap.h +++ b/arch/arm/include/asm/fixmap.h @@ -6,8 +6,8 @@ #define FIXADDR_END 0xfff00000UL #define FIXADDR_TOP (FIXADDR_END - PAGE_SIZE) +#include #include -#include enum fixed_addresses { FIX_EARLYCON_MEM_BASE, diff --git a/arch/arm/include/asm/idmap.h b/arch/arm/include/asm/idmap.h index 73ba956e379f..aab7e8358e6a 100644 --- a/arch/arm/include/asm/idmap.h +++ b/arch/arm/include/asm/idmap.h @@ -3,7 +3,7 @@ #define __ASM_IDMAP_H #include -#include +#include /* Tag a function as requiring to be executed via an identity mapping. */ #define __idmap __section(.idmap.text) noinline notrace diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index 9e084a464a97..3502c2f746ca 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h @@ -187,6 +187,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) { return (pmd_t *)pud; } +#define pmd_offset pmd_offset #define pmd_large(pmd) (pmd_val(pmd) & 2) #define pmd_leaf(pmd) (pmd_val(pmd) & 2) diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 1933aed9f68d..fbb6693c3352 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -133,13 +133,6 @@ static inline pmd_t *pud_page_vaddr(pud_t pud) return __va(pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK); } -/* Find an entry in the second-level page table.. */ -#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) -static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) -{ - return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr); -} - #define pmd_bad(pmd) (!(pmd_val(pmd) & 2)) #define copy_pmd(pmdpd,pmdps) \ diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h index 30fb2330f57b..d16aba48fa0a 100644 --- a/arch/arm/include/asm/pgtable-nommu.h +++ b/arch/arm/include/asm/pgtable-nommu.h @@ -22,7 +22,6 @@ #define pgd_bad(pgd) (0) #define pgd_clear(pgdp) #define kern_addr_valid(addr) (1) -#define pmd_offset(a, b) ((void *)0) /* FIXME */ /* * PMD_SHIFT determines the size of the area a second-level page table can map @@ -73,8 +72,6 @@ extern unsigned int kobjsize(const void *objp); #define FIRST_USER_ADDRESS 0UL -#include - #else /* diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index fba20607c53c..c02f24400369 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -166,14 +166,6 @@ extern struct page *empty_zero_page; extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; -/* to find an entry in a page-table-directory */ -#define pgd_index(addr) ((addr) >> PGDIR_SHIFT) - -#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr)) - -/* to find an entry in a kernel page-table-directory */ -#define pgd_offset_k(addr) pgd_offset(&init_mm, addr) - #define pmd_none(pmd) (!pmd_val(pmd)) static inline pte_t *pmd_page_vaddr(pmd_t pmd) @@ -183,21 +175,6 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd) #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) -#ifndef CONFIG_HIGHPTE -#define __pte_map(pmd) pmd_page_vaddr(*(pmd)) -#define __pte_unmap(pte) do { } while (0) -#else -#define __pte_map(pmd) (pte_t *)kmap_atomic(pmd_page(*(pmd))) -#define __pte_unmap(pte) kunmap_atomic(pte) -#endif - -#define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) - -#define pte_offset_kernel(pmd,addr) (pmd_page_vaddr(*(pmd)) + pte_index(addr)) - -#define pte_offset_map(pmd,addr) (__pte_map(pmd) + pte_index(addr)) -#define pte_unmap(pte) __pte_unmap(pte) - #define pte_pfn(pte) ((pte_val(pte) & PHYS_MASK) >> PAGE_SHIFT) #define pfn_pte(pfn,prot) __pte(__pfn_to_phys(pfn) | pgprot_val(prot)) @@ -339,8 +316,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) /* FIXME: this is not correct */ #define kern_addr_valid(addr) (1) -#include - /* * We provide our own arch_get_unmapped_area to cope with VIPT caches. */ diff --git a/arch/arm/include/asm/traps.h b/arch/arm/include/asm/traps.h index 172b08ff3760..987fefb0a4db 100644 --- a/arch/arm/include/asm/traps.h +++ b/arch/arm/include/asm/traps.h @@ -29,7 +29,8 @@ static inline int __in_irqentry_text(unsigned long ptr) } extern void __init early_trap_init(void *); -extern void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long frame); +extern void dump_backtrace_entry(unsigned long where, unsigned long from, + unsigned long frame, const char *loglvl); extern void ptrace_break(struct pt_regs *regs); extern void *vectors_page; diff --git a/arch/arm/include/asm/unwind.h b/arch/arm/include/asm/unwind.h index 6e282c33126b..0f8a3439902d 100644 --- a/arch/arm/include/asm/unwind.h +++ b/arch/arm/include/asm/unwind.h @@ -36,7 +36,8 @@ extern struct unwind_table *unwind_table_add(unsigned long start, unsigned long text_addr, unsigned long text_size); extern void unwind_table_del(struct unwind_table *tab); -extern void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk); +extern void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl); #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm/kernel/fiq.c b/arch/arm/kernel/fiq.c index cd1234c103fc..98ca3e3fa847 100644 --- a/arch/arm/kernel/fiq.c +++ b/arch/arm/kernel/fiq.c @@ -98,8 +98,8 @@ void set_fiq_handler(void *start, unsigned int length) memcpy(base + offset, start, length); if (!cache_is_vipt_nonaliasing()) - flush_icache_range((unsigned long)base + offset, offset + - length); + flush_icache_range((unsigned long)base + offset, + (unsigned long)base + offset + length); flush_icache_range(0xffff0000 + offset, 0xffff0000 + offset + length); } diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index c49b39340ddb..f8904227e7fd 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -10,6 +10,7 @@ */ #include #include +#include #include #include @@ -18,7 +19,6 @@ #include #include #include -#include #if defined(CONFIG_DEBUG_LL) && !defined(CONFIG_DEBUG_SEMIHOSTING) #include CONFIG_DEBUG_LL_INCLUDE diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 76300f3813e8..974b6c64d3e6 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index af0a8500a24e..e15444b25ca0 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 46e478fb5ea2..58eaa1f60e16 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -431,7 +431,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) npages = 1; /* for sigpage */ npages += vdso_total_pages; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; hint = sigpage_addr(mm, npages); addr = get_unmapped_area(NULL, hint, npages << PAGE_SHIFT, 0, 0); @@ -458,7 +458,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) arm_install_vdso(mm, addr + PAGE_SIZE); up_fail: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } #endif diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 4cc6a7eff635..d0f7c8896c96 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -25,7 +25,6 @@ #include #include -#include #include #define CREATE_TRACE_POINTS diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 46e1be9e57a8..9a6432557871 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/kernel/suspend.c b/arch/arm/kernel/suspend.c index d08099269e35..d2c9338d74e8 100644 --- a/arch/arm/kernel/suspend.c +++ b/arch/arm/kernel/suspend.c @@ -2,12 +2,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c index e640871328c1..6166ba38bf99 100644 --- a/arch/arm/kernel/swp_emulate.c +++ b/arch/arm/kernel/swp_emulate.c @@ -97,12 +97,12 @@ static void set_segfault(struct pt_regs *regs, unsigned long addr) { int si_code; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); if (find_vma(current->mm, addr) == NULL) si_code = SEGV_MAPERR; else si_code = SEGV_ACCERR; - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); pr_debug("SWP{B} emulation: access caused memory abort!\n"); arm_notify_die("Illegal memory access", regs, diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 1e70e7227f0f..65a3b1e75480 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -62,21 +62,24 @@ __setup("user_debug=", user_debug_setup); static void dump_mem(const char *, const char *, unsigned long, unsigned long); -void dump_backtrace_entry(unsigned long where, unsigned long from, unsigned long frame) +void dump_backtrace_entry(unsigned long where, unsigned long from, + unsigned long frame, const char *loglvl) { unsigned long end = frame + 4 + sizeof(struct pt_regs); #ifdef CONFIG_KALLSYMS - printk("[<%08lx>] (%ps) from [<%08lx>] (%pS)\n", where, (void *)where, from, (void *)from); + printk("%s[<%08lx>] (%ps) from [<%08lx>] (%pS)\n", + loglvl, where, (void *)where, from, (void *)from); #else - printk("Function entered at [<%08lx>] from [<%08lx>]\n", where, from); + printk("%sFunction entered at [<%08lx>] from [<%08lx>]\n", + loglvl, where, from); #endif if (in_entry_text(from) && end <= ALIGN(frame, THREAD_SIZE)) - dump_mem("", "Exception stack", frame + 4, end); + dump_mem(loglvl, "Exception stack", frame + 4, end); } -void dump_backtrace_stm(u32 *stack, u32 instruction) +void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl) { char str[80], *p; unsigned int x; @@ -88,12 +91,12 @@ void dump_backtrace_stm(u32 *stack, u32 instruction) if (++x == 6) { x = 0; p = str; - printk("%s\n", str); + printk("%s%s\n", loglvl, str); } } } if (p != str) - printk("%s\n", str); + printk("%s%s\n", loglvl, str); } #ifndef CONFIG_ARM_UNWIND @@ -201,17 +204,19 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) } #ifdef CONFIG_ARM_UNWIND -static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +static inline void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { - unwind_backtrace(regs, tsk); + unwind_backtrace(regs, tsk, loglvl); } #else -static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) +static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { unsigned int fp, mode; int ok = 1; - printk("Backtrace: "); + printk("%sBacktrace: ", loglvl); if (!tsk) tsk = current; @@ -238,13 +243,13 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) pr_cont("\n"); if (ok) - c_backtrace(fp, mode); + c_backtrace(fp, mode, loglvl); } #endif -void show_stack(struct task_struct *tsk, unsigned long *sp) +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) { - dump_backtrace(NULL, tsk); + dump_backtrace(NULL, tsk, loglvl); barrier(); } @@ -288,7 +293,7 @@ static int __die(const char *str, int err, struct pt_regs *regs) if (!user_mode(regs) || in_interrupt()) { dump_mem(KERN_EMERG, "Stack: ", regs->ARM_sp, THREAD_SIZE + (unsigned long)task_stack_page(tsk)); - dump_backtrace(regs, tsk); + dump_backtrace(regs, tsk, KERN_EMERG); dump_instr(KERN_EMERG, regs); } @@ -566,7 +571,7 @@ __do_cache_op(unsigned long start, unsigned long end) if (fatal_signal_pending(current)) return 0; - ret = flush_cache_user_range(start, start + chunk); + ret = flush_icache_user_range(start, start + chunk); if (ret) return ret; @@ -663,10 +668,10 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs) if (user_debug & UDBG_SYSCALL) { pr_err("[%d] %s: arm syscall %d\n", task_pid_nr(current), current->comm, no); - dump_instr("", regs); + dump_instr(KERN_ERR, regs); if (user_mode(regs)) { __show_regs(regs); - c_backtrace(frame_pointer(regs), processor_mode(regs)); + c_backtrace(frame_pointer(regs), processor_mode(regs), KERN_ERR); } } #endif diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c index 11a964fd66f4..d2bd0df2318d 100644 --- a/arch/arm/kernel/unwind.c +++ b/arch/arm/kernel/unwind.c @@ -455,7 +455,8 @@ int unwind_frame(struct stackframe *frame) return URC_OK; } -void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk) +void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) { struct stackframe frame; @@ -493,7 +494,7 @@ void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk) urc = unwind_frame(&frame); if (urc < 0) break; - dump_backtrace_entry(where, frame.pc, frame.sp - 4); + dump_backtrace_entry(where, frame.pc, frame.sp - 4, loglvl); } } diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index e0330a25e1c6..6bfdca4769a7 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -240,7 +240,7 @@ static int install_vvar(struct mm_struct *mm, unsigned long addr) return PTR_ERR_OR_ZERO(vma); } -/* assumes mmap_sem is write-locked */ +/* assumes mmap_lock is write-locked */ void arm_install_vdso(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 88a720da443b..7f24bc08403e 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -8,13 +8,13 @@ #include "vmlinux-xip.lds.S" #else +#include #include #include #include #include #include #include -#include #include "vmlinux.lds.h" diff --git a/arch/arm/lib/backtrace-clang.S b/arch/arm/lib/backtrace-clang.S index 2ff375144b55..6174c45f53a5 100644 --- a/arch/arm/lib/backtrace-clang.S +++ b/arch/arm/lib/backtrace-clang.S @@ -17,6 +17,7 @@ #define sv_pc r6 #define mask r7 #define sv_lr r8 +#define loglvl r9 ENTRY(c_backtrace) @@ -99,6 +100,7 @@ ENDPROC(c_backtrace) @ to ensure 8 byte alignment movs frame, r0 @ if frame pointer is zero beq no_frame @ we have no stack frames + mov loglvl, r2 tst r1, #0x10 @ 26 or 32-bit mode? moveq mask, #0xfc000003 movne mask, #0 @ mask for 32-bit @@ -167,6 +169,7 @@ finished_setup: mov r1, sv_lr mov r2, frame bic r1, r1, mask @ mask PC/LR for the mode + mov r3, loglvl bl dump_backtrace_entry /* @@ -183,6 +186,7 @@ finished_setup: ldr r0, [frame] @ locals are stored in @ the preceding frame subeq r0, r0, #4 + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers /* @@ -196,7 +200,8 @@ finished_setup: bhi for_each_frame 1006: adr r0, .Lbad - mov r1, frame + mov r1, loglvl + mov r2, frame bl printk no_frame: ldmfd sp!, {r4 - r9, fp, pc} ENDPROC(c_backtrace) @@ -209,7 +214,7 @@ ENDPROC(c_backtrace) .long 1005b, 1006b .popsection -.Lbad: .asciz "Backtrace aborted due to bad frame pointer <%p>\n" +.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" .align .Lopcode: .word 0xe92d4800 >> 11 @ stmfd sp!, {... fp, lr} .word 0x0b000000 @ bl if these bits are set diff --git a/arch/arm/lib/backtrace.S b/arch/arm/lib/backtrace.S index 582925238d65..872f658638d9 100644 --- a/arch/arm/lib/backtrace.S +++ b/arch/arm/lib/backtrace.S @@ -18,6 +18,7 @@ #define sv_pc r6 #define mask r7 #define offset r8 +#define loglvl r9 ENTRY(c_backtrace) @@ -25,9 +26,10 @@ ENTRY(c_backtrace) ret lr ENDPROC(c_backtrace) #else - stmfd sp!, {r4 - r8, lr} @ Save an extra register so we have a location... + stmfd sp!, {r4 - r9, lr} @ Save an extra register so we have a location... movs frame, r0 @ if frame pointer is zero beq no_frame @ we have no stack frames + mov loglvl, r2 tst r1, #0x10 @ 26 or 32-bit mode? ARM( moveq mask, #0xfc000003 ) @@ -73,6 +75,7 @@ for_each_frame: tst frame, mask @ Check for address exceptions ldr r1, [frame, #-4] @ get saved lr mov r2, frame bic r1, r1, mask @ mask PC/LR for the mode + mov r3, loglvl bl dump_backtrace_entry ldr r1, [sv_pc, #-4] @ if stmfd sp!, {args} exists, @@ -80,12 +83,14 @@ for_each_frame: tst frame, mask @ Check for address exceptions teq r3, r1, lsr #11 ldreq r0, [frame, #-8] @ get sp subeq r0, r0, #4 @ point at the last arg + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers 1004: ldr r1, [sv_pc, #0] @ if stmfd sp!, {..., fp, ip, lr, pc} ldr r3, .Ldsi @ instruction exists, teq r3, r1, lsr #11 subeq r0, frame, #16 + mov r2, loglvl bleq dump_backtrace_stm @ dump saved registers teq sv_fp, #0 @ zero saved fp means @@ -96,9 +101,10 @@ for_each_frame: tst frame, mask @ Check for address exceptions bhi for_each_frame 1006: adr r0, .Lbad - mov r1, frame + mov r1, loglvl + mov r2, frame bl printk -no_frame: ldmfd sp!, {r4 - r8, pc} +no_frame: ldmfd sp!, {r4 - r9, pc} ENDPROC(c_backtrace) .pushsection __ex_table,"a" @@ -109,7 +115,7 @@ ENDPROC(c_backtrace) .long 1004b, 1006b .popsection -.Lbad: .asciz "Backtrace aborted due to bad frame pointer <%p>\n" +.Lbad: .asciz "%sBacktrace aborted due to bad frame pointer <%p>\n" .align .Ldsi: .word 0xe92dd800 >> 11 @ stmfd sp!, {... fp, ip, lr, pc} .word 0xe92d0000 >> 11 @ stmfd sp!, {} diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index d72b14c96670..106f83a5ea6d 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -101,7 +101,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) atomic = faulthandler_disabled(); if (!atomic) - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); while (n) { pte_t *pte; spinlock_t *ptl; @@ -109,11 +109,11 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) while (!pin_page_for_write(to, &pte, &ptl)) { if (!atomic) - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (__put_user(0, (char __user *)to)) goto out; if (!atomic) - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); } tocopy = (~(unsigned long)to & ~PAGE_MASK) + 1; @@ -133,7 +133,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) spin_unlock(ptl); } if (!atomic) - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); out: return n; @@ -170,17 +170,17 @@ __clear_user_memset(void __user *addr, unsigned long n) return 0; } - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); while (n) { pte_t *pte; spinlock_t *ptl; int tocopy; while (!pin_page_for_write(addr, &pte, &ptl)) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (__put_user(0, (char __user *)addr)) goto out; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); } tocopy = (~(unsigned long)addr & ~PAGE_MASK) + 1; @@ -198,7 +198,7 @@ __clear_user_memset(void __user *addr, unsigned long n) else spin_unlock(ptl); } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); out: return n; diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c index 575b2e2b6759..5960e3dfd2bf 100644 --- a/arch/arm/mach-ebsa110/core.c +++ b/arch/arm/mach-ebsa110/core.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/arch/arm/mach-footbridge/common.c b/arch/arm/mach-footbridge/common.c index 015f75d1c98d..eee095f0e2f6 100644 --- a/arch/arm/mach-footbridge/common.c +++ b/arch/arm/mach-footbridge/common.c @@ -14,7 +14,6 @@ #include #include