Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
 "257 patches.

  Subsystems affected by this patch series: scripts, ocfs2, vfs, and mm
  (slab-generic, slab, slub, kconfig, dax, kasan, debug, pagecache, gup,
  swap, memcg, pagemap, mprotect, mremap, iomap, tracing, vmalloc,
  pagealloc, memory-failure, hugetlb, userfaultfd, vmscan, tools,
  memblock, oom-kill, hugetlbfs, migration, thp, readahead, nommu, ksm,
  vmstat, madvise, memory-hotplug, rmap, zsmalloc, highmem, zram,
  cleanups, kfence, and damon)"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (257 commits)
  mm/damon: remove return value from before_terminate callback
  mm/damon: fix a few spelling mistakes in comments and a pr_debug message
  mm/damon: simplify stop mechanism
  Docs/admin-guide/mm/pagemap: wordsmith page flags descriptions
  Docs/admin-guide/mm/damon/start: simplify the content
  Docs/admin-guide/mm/damon/start: fix a wrong link
  Docs/admin-guide/mm/damon/start: fix wrong example commands
  mm/damon/dbgfs: add adaptive_targets list check before enable monitor_on
  mm/damon: remove unnecessary variable initialization
  Documentation/admin-guide/mm/damon: add a document for DAMON_RECLAIM
  mm/damon: introduce DAMON-based Reclamation (DAMON_RECLAIM)
  selftests/damon: support watermarks
  mm/damon/dbgfs: support watermarks
  mm/damon/schemes: activate schemes based on a watermarks mechanism
  tools/selftests/damon: update for regions prioritization of schemes
  mm/damon/dbgfs: support prioritization weights
  mm/damon/vaddr,paddr: support pageout prioritization
  mm/damon/schemes: prioritize regions within the quotas
  mm/damon/selftests: support schemes quotas
  mm/damon/dbgfs: support quotas of schemes
  ...
This commit is contained in:
Commit 512b7931ad
@@ -328,6 +328,14 @@ as idle::

From now on, any pages on zram are idle pages.  The idle mark will be
removed only when someone requests access of the block.  IOW, unless
there is an access request, those pages remain idle pages.

Additionally, when CONFIG_ZRAM_MEMORY_TRACKING is enabled, pages can be
marked as idle based on how long (in seconds) it has been since they were
last accessed::

        echo 86400 > /sys/block/zramX/idle

In this example all pages which haven't been accessed in more than 86400
seconds (one day) will be marked idle.

Admins can request writeback of those idle pages at the right time via::
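For instance, a plausible completion of the above (a sketch, assuming ``CONFIG_ZRAM_WRITEBACK`` is enabled and a backing device has been configured through the ``backing_dev`` attribute; the ``idle`` keyword requests writeback of idle pages only)::

        echo idle > /sys/block/zramX/writeback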
@@ -87,10 +87,8 @@ Brief summary of control files.

 memory.oom_control                  set/show oom controls.
 memory.numa_stat                    show the number of memory usage per numa
                                     node
 memory.kmem.limit_in_bytes          This knob is deprecated and writing to
                                     it will return -ENOTSUPP.
 memory.kmem.usage_in_bytes          show current kernel memory allocation
 memory.kmem.failcnt                 show the number of kernel memory usage
                                     hits limits
@@ -518,11 +516,6 @@ will be charged as a new owner of it.

charged file caches.  Some out-of-use page caches may remain charged until
memory pressure happens.  If you want to avoid that, force_empty will be useful.

5.2 stat file
-------------
@@ -1582,8 +1582,10 @@

			registers.  Default set by CONFIG_HPET_MMAP_DEFAULT.

	hugetlb_cma=	[HW,CMA] The size of a CMA area used for allocation
			of gigantic hugepages.  Or using node format, the size
			of a CMA area per node can be specified.
			Format: nn[KMGTPE] or (node format)
				<node>:nn[KMGTPE][,<node>:nn[KMGTPE]]
|
Reserve a CMA area of given size and allocate gigantic
|
||||||
hugepages using the CMA allocator. If enabled, the
|
hugepages using the CMA allocator. If enabled, the
|
||||||
|
@@ -1594,9 +1596,11 @@

			the number of pages of hugepagesz to be allocated.
			If this is the first HugeTLB parameter on the command
			line, it specifies the number of pages to allocate for
			the default huge page size.  If using node format, the
			number of pages to allocate per-node can be specified.
			See also Documentation/admin-guide/mm/hugetlbpage.rst.
			Format: <integer> or (node format)
				<node>:<integer>[,<node>:<integer>]

	hugepagesz=
			[HW] The size of the HugeTLB pages.  This is used in
@@ -13,3 +13,4 @@ optimize those.

   start
   usage
   reclaim
@@ -0,0 +1,235 @@

.. SPDX-License-Identifier: GPL-2.0

=======================
DAMON-based Reclamation
=======================

DAMON-based Reclamation (DAMON_RECLAIM) is a static kernel module aimed at
proactive and lightweight reclamation under light memory pressure.  It doesn't
aim to replace the LRU-list based page-granularity reclamation, but to be
selectively used for different levels of memory pressure and requirements.

Where Proactive Reclamation is Required?
========================================

On general memory over-committed systems, proactively reclaiming cold pages
helps save memory and reduce the latency spikes incurred by direct reclaim of
the process or CPU consumption of kswapd, while incurring only minimal
performance degradation [1]_ [2]_ .

Free Pages Reporting [3]_ based memory over-commit virtualization systems are
a good example of such cases.  In those systems, the guest VMs report their
free memory to the host, and the host reallocates the reported memory to other
guests.  As a result, the memory of the systems is fully utilized.  However,
the guests could be not so memory-frugal, mainly because some kernel
subsystems and user-space applications are designed to use as much memory as
available.  Then, guests could report only a small amount of memory as free to
the host, resulting in a drop of the memory utilization of the systems.
Running the proactive reclamation in guests could mitigate this problem.

How It Works?
=============

DAMON_RECLAIM finds memory regions that were not accessed for a specific time
duration and pages them out.  To avoid consuming too much CPU for the paging
out operation, a speed limit can be configured.  Under the speed limit, it
pages out the memory regions that were not accessed for the longest time
first.  System administrators can also configure under what situation this
scheme should be automatically activated and deactivated, with three memory
pressure watermarks.

Interface: Module Parameters
============================

To use this feature, you should first ensure your system is running on a
kernel that is built with ``CONFIG_DAMON_RECLAIM=y``.

To let sysadmins enable or disable it and tune it for the given system,
DAMON_RECLAIM utilizes module parameters.  That is, you can put
``damon_reclaim.<parameter>=<value>`` on the kernel boot command line or write
proper values to ``/sys/module/damon_reclaim/parameters/<parameter>`` files.

Note that the parameter values except ``enabled`` are applied only when
DAMON_RECLAIM starts.  Therefore, if you want to apply new parameter values at
runtime while DAMON_RECLAIM is already enabled, you should disable and
re-enable it via the ``enabled`` parameter file.  Writing the new values to
the proper parameter files should be done before the re-enablement.
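For example, such a runtime retune could look as follows (a sketch; the ``min_age`` value of 60 seconds is arbitrary, and the paths assume the parameter files described below)::

    # cd /sys/module/damon_reclaim/parameters
    # echo N > enabled
    # echo 60000000 > min_age
    # echo Y > enabled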
Below are descriptions of each parameter.

enabled
-------

Enable or disable DAMON_RECLAIM.

You can enable DAMON_RECLAIM by setting the value of this parameter as ``Y``.
Setting it as ``N`` disables DAMON_RECLAIM.  Note that DAMON_RECLAIM could do
no real monitoring and reclamation due to its watermarks-based activation
condition.  Refer to the descriptions of the watermark parameters below for
this.

min_age
-------

Time threshold for cold memory regions identification in microseconds.

If a memory region is not accessed for this or a longer time, DAMON_RECLAIM
identifies the region as cold, and reclaims it.

120 seconds by default.

quota_ms
--------

Limit of time for the reclamation in milliseconds.

DAMON_RECLAIM tries to use only up to this time within a time window
(quota_reset_interval_ms) for trying reclamation of cold pages.  This can be
used for limiting the CPU consumption of DAMON_RECLAIM.  If the value is zero,
the limit is disabled.

10 ms by default.

quota_sz
--------

Limit of size of memory for the reclamation in bytes.

DAMON_RECLAIM charges the amount of memory which it tried to reclaim within a
time window (quota_reset_interval_ms) and makes sure no more than this limit
is tried.  This can be used for limiting the consumption of CPU and IO.  If
this value is zero, the limit is disabled.

128 MiB by default.

quota_reset_interval_ms
-----------------------

The time/size quota charge reset interval in milliseconds.

The charge reset interval for the quotas of time (quota_ms) and size
(quota_sz).  That is, DAMON_RECLAIM does not try reclamation for more than
quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms
milliseconds.

1 second by default.
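Taken together, the size quota and this reset interval bound the reclaim throughput; as a rough back-of-the-envelope sketch using the defaults above::

    max reclaim throughput ~= quota_sz / quota_reset_interval_ms
                            = 128 MiB / 1000 ms
                            = 128 MiB/s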
wmarks_interval
---------------

Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is
enabled but inactive due to its watermarks rule.

wmarks_high
-----------

Free memory rate (per thousand) for the high watermark.

If the free memory of the system in bytes per thousand bytes is higher than
this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically
checks the watermarks.

wmarks_mid
----------

Free memory rate (per thousand) for the middle watermark.

If the free memory of the system in bytes per thousand bytes is between this
and the low watermark, DAMON_RECLAIM becomes active, so it starts the
monitoring and the reclaiming.

wmarks_low
----------

Free memory rate (per thousand) for the low watermark.

If the free memory of the system in bytes per thousand bytes is lower than
this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically
checks the watermarks.  In that case, the system falls back to the LRU-list
based page granularity reclamation logic.
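To relate these per-thousand thresholds to a live system, the current free memory rate can be approximated from ``/proc/meminfo`` (a sketch; comparing MemFree against MemTotal is a simplification of the kernel's internal accounting, and the printed value is illustrative)::

    $ awk '/MemTotal/ {t=$2} /MemFree/ {f=$2} END {print int(f * 1000 / t)}' /proc/meminfo
    377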
sample_interval
---------------

Sampling interval for the monitoring in microseconds.

The sampling interval of DAMON for the cold memory monitoring.  Please refer
to the DAMON documentation (:doc:`usage`) for more detail.

aggr_interval
-------------

Aggregation interval for the monitoring in microseconds.

The aggregation interval of DAMON for the cold memory monitoring.  Please
refer to the DAMON documentation (:doc:`usage`) for more detail.

min_nr_regions
--------------

Minimum number of monitoring regions.

The minimal number of monitoring regions of DAMON for the cold memory
monitoring.  This can be used to set a lower bound on the monitoring quality.
But, setting this too high could result in increased monitoring overhead.
Please refer to the DAMON documentation (:doc:`usage`) for more detail.

max_nr_regions
--------------

Maximum number of monitoring regions.

The maximum number of monitoring regions of DAMON for the cold memory
monitoring.  This can be used to set an upper bound on the monitoring
overhead.  However, setting this too low could result in bad monitoring
quality.  Please refer to the DAMON documentation (:doc:`usage`) for more
detail.

monitor_region_start
--------------------

Start of the target memory region in physical address.

The start physical address of the memory region that DAMON_RECLAIM will do
work against.  That is, DAMON_RECLAIM will find cold memory regions in this
region and reclaim them.  By default, the biggest System RAM region is used.

monitor_region_end
------------------

End of the target memory region in physical address.

The end physical address of the memory region that DAMON_RECLAIM will do work
against.  That is, DAMON_RECLAIM will find cold memory regions in this region
and reclaim them.  By default, the biggest System RAM region is used.
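For instance (a sketch; the addresses are placeholders, and candidate ranges can be found by looking for "System RAM" entries in ``/proc/iomem``), a specific physical range could be targeted with::

    # grep "System RAM" /proc/iomem
    # echo $((4 * 1024 * 1024 * 1024)) > /sys/module/damon_reclaim/parameters/monitor_region_start
    # echo $((16 * 1024 * 1024 * 1024)) > /sys/module/damon_reclaim/parameters/monitor_region_end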
kdamond_pid
-----------

PID of the DAMON thread.

If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
Else, -1.
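So whether the worker thread actually started can be checked right after enabling (the printed PID is illustrative)::

    # cat /sys/module/damon_reclaim/parameters/kdamond_pid
    1499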
Example
=======

The below runtime example commands make DAMON_RECLAIM find memory regions
that were not accessed for 30 seconds or more and page them out.  The
reclamation is limited to at most 1 GiB per second, to avoid DAMON_RECLAIM
consuming too much CPU time for the paging out operation.  It also asks
DAMON_RECLAIM to do nothing if the system's free memory rate is more than 50%,
but to start the real work if it becomes lower than 40%.  If DAMON_RECLAIM
doesn't make progress and therefore the free memory rate becomes lower than
20%, it asks DAMON_RECLAIM to do nothing again, so that we can fall back to
the LRU-list based page granularity reclamation. ::

    # cd /sys/module/damon_reclaim/parameters
    # echo 30000000 > min_age
    # echo $((1 * 1024 * 1024 * 1024)) > quota_sz
    # echo 1000 > quota_reset_interval_ms
    # echo 500 > wmarks_high
    # echo 400 > wmarks_mid
    # echo 200 > wmarks_low
    # echo Y > enabled

.. [1] https://research.google/pubs/pub48551/
.. [2] https://lwn.net/Articles/787611/
.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html
@@ -6,39 +6,9 @@ Getting Started

This document briefly describes how you can use DAMON by demonstrating its
default user space tool.  Please note that this document describes only a part
of its features for brevity.  Please refer to the usage `doc
<https://github.com/awslabs/damo/blob/next/USAGE.md>`_ of the tool for more
details.


Prerequisites
@@ -91,24 +61,74 @@ pattern in the ``damon.data`` file.

Visualizing Recorded Patterns
=============================

You can visualize the pattern in a heatmap, showing which memory region
(x-axis) got accessed when (y-axis) and how frequently (number)::

    $ sudo damo report heats --heatmap stdout
    22222222222222222222222222222222222222211111111111111111111111111111111111111100
    44444444444444444444444444444444444444434444444444444444444444444444444444443200
    44444444444444444444444444444444444444433444444444444444444444444444444444444200
    33333333333333333333333333333333333333344555555555555555555555555555555555555200
    33333333333333333333333333333333333344444444444444444444444444444444444444444200
    22222222222222222222222222222222222223355555555555555555555555555555555555555200
    00000000000000000000000000000000000000288888888888888888888888888888888888888400
    00000000000000000000000000000000000000288888888888888888888888888888888888888400
    33333333333333333333333333333333333333355555555555555555555555555555555555555200
    88888888888888888888888888888888888888600000000000000000000000000000000000000000
    88888888888888888888888888888888888888600000000000000000000000000000000000000000
    33333333333333333333333333333333333333444444444444444444444444444444444444443200
    00000000000000000000000000000000000000288888888888888888888888888888888888888400
    [...]
    # access_frequency:  0  1  2  3  4  5  6  7  8  9
    # x-axis: space (139728247021568-139728453431248: 196.848 MiB)
    # y-axis: time (15256597248362-15326899978162: 1 m 10.303 s)
    # resolution: 80x40 (2.461 MiB and 1.758 s for each character)

You can also visualize the distribution of the working set size, sorted by the
size::

    $ sudo damo report wss --range 0 101 10
    # <percentile> <wss>
    # target_id     18446632103789443072
    # avr:  107.708 MiB
      0             0 B |                                                           |
     10      95.328 MiB |****************************                               |
     20      95.332 MiB |****************************                               |
     30      95.340 MiB |****************************                               |
     40      95.387 MiB |****************************                               |
     50      95.387 MiB |****************************                               |
     60      95.398 MiB |****************************                               |
     70      95.398 MiB |****************************                               |
     80      95.504 MiB |****************************                               |
     90     190.703 MiB |*********************************************************  |
    100     196.875 MiB |***********************************************************|

Using the ``--sortby`` option with the above command, you can show how the
working set size has chronologically changed::

    $ sudo damo report wss --range 0 101 10 --sortby time
    # <percentile> <wss>
    # target_id     18446632103789443072
    # avr:  107.708 MiB
      0       3.051 MiB |                                                           |
     10     190.703 MiB |***********************************************************|
     20      95.336 MiB |*****************************                              |
     30      95.328 MiB |*****************************                              |
     40      95.387 MiB |*****************************                              |
     50      95.332 MiB |*****************************                              |
     60      95.320 MiB |*****************************                              |
     70      95.398 MiB |*****************************                              |
     80      95.398 MiB |*****************************                              |
     90      95.340 MiB |*****************************                              |
    100      95.398 MiB |*****************************                              |


Data Access Pattern Aware Memory Management
===========================================

The below three commands make every memory region of size >=4K that hasn't
been accessed for >=60 seconds in your workload swapped out. ::

    $ echo "#min-size max-size min-acc max-acc min-age max-age action" > test_scheme
    $ echo "4K        max      0       0       60s     max     pageout" >> test_scheme
    $ damo schemes -c test_scheme <pid of your workload>
@@ -10,15 +10,16 @@ DAMON provides below three interfaces for different users.

  This is for privileged people such as system administrators who want a
  just-working human-friendly interface.  Using this, users can use DAMON's
  major features in a human-friendly way.  It may not be highly tuned for
  special cases, though.  It supports both virtual and physical address spaces
  monitoring.
- *debugfs interface.*
  This is for privileged user space programmers who want more optimized use of
  DAMON.  Using this, users can use DAMON's major features by reading
  from and writing to special debugfs files.  Therefore, you can write and use
  your personalized DAMON debugfs wrapper programs that read/write the
  debugfs files instead of you.  The DAMON user space tool is also a reference
  implementation of such programs.  It supports both virtual and physical
  address spaces monitoring.
- *Kernel Space Programming Interface.*
  This is for kernel space programmers.  Using this, users can utilize every
  feature of DAMON most flexibly and efficiently by writing kernel space
@@ -34,8 +35,9 @@ the reason, this document describes only the debugfs interface

debugfs Interface
=================

DAMON exports five files, ``attrs``, ``target_ids``, ``init_regions``,
``schemes`` and ``monitor_on`` under its debugfs directory,
``<debugfs>/damon/``.


Attributes
@@ -71,9 +73,106 @@ check it again::

    # cat target_ids
    42 4242

Users can also monitor the physical memory address space of the system by
writing a special keyword, "``paddr\n``", to the file.  Because physical
address space monitoring doesn't support multiple targets, reading the file
will show a fake value, ``42``, as below::

    # cd <debugfs>/damon
    # echo paddr > target_ids
    # cat target_ids
    42

Note that setting the target ids doesn't start the monitoring.


Initial Monitoring Target Regions
---------------------------------

In case of the virtual address space monitoring, DAMON automatically sets and
updates the monitoring target regions so that entire memory mappings of the
target processes can be covered.  However, users may want to limit the
monitoring region to specific address ranges, such as the heap, the stack, or
specific file-mapped areas.  Or, some users may know the initial access
pattern of their workloads and therefore want to set optimal initial regions
for the 'adaptive regions adjustment'.

In contrast, DAMON does not automatically set and update the monitoring target
regions in case of physical memory monitoring.  Therefore, users should set
the monitoring target regions by themselves.

In such cases, users can explicitly set the initial monitoring target regions
as they want, by writing proper values to the ``init_regions`` file.  Each
line of the input should represent one region in the below form::

    <target id> <start address> <end address>

The ``target id`` should already be in the ``target_ids`` file, and the
regions should be passed in address order.  For example, the below commands
will set a couple of address ranges, ``1-100`` and ``100-200``, as the initial
monitoring target regions of process 42, and another couple of address ranges,
``20-40`` and ``50-100``, as those of process 4242::

    # cd <debugfs>/damon
    # echo "42   1       100
            42   100     200
            4242 20      40
            4242 50      100" > init_regions

Note that this sets the initial monitoring target regions only.  In case of
virtual memory monitoring, DAMON will automatically update the boundaries of
the regions after one ``regions update interval``.  Therefore, users should
set the ``regions update interval`` large enough in this case, if they don't
want the update.


Schemes
-------

For usual DAMON-based data access aware memory management optimizations, users
would simply want the system to apply a memory management action to a memory
region of a specific size having a specific access frequency for a specific
time.  DAMON receives such formalized operation schemes from the user and
applies those to the target processes.  It also counts the total number and
size of regions that each scheme is applied to.  These statistics can be used
for online analysis or tuning of the schemes.

Users can get and set the schemes by reading from and writing to the
``schemes`` debugfs file.  Reading the file also shows the statistics of each
scheme.  To the file, each of the schemes should be represented on a line in
the below form::

    min-size max-size min-acc max-acc min-age max-age action

Note that the ranges are closed intervals.  Bytes for the size of regions
(``min-size`` and ``max-size``), number of monitored accesses per aggregate
interval for access frequency (``min-acc`` and ``max-acc``), number of
aggregate intervals for the age of regions (``min-age`` and ``max-age``), and
a predefined integer for memory management actions should be used.  The
supported numbers and their meanings are as below.

 - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``
 - 1: Call ``madvise()`` for the region with ``MADV_COLD``
 - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``
 - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``
 - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
 - 5: Do nothing but count the statistics

You can disable schemes by simply writing an empty string to the file.  For
example, the below commands apply a scheme saying "If a memory region of size
in [4KiB, 8KiB] is showing accesses per aggregate interval in [0, 5] for
aggregate interval in [10, 20], page out the region", check the entered scheme
again, and finally remove the scheme. ::

    # cd <debugfs>/damon
    # echo "4096 8192    0 5    10 20    2" > schemes
    # cat schemes
    4096 8192 0 5 10 20 2 0 0
    # echo > schemes

The last two integers in the 4th line of the above example are the total
number and the total size of the regions that the scheme was applied to.
@@ -128,7 +128,9 @@ hugepages

	implicitly specifies the number of huge pages of default size to
	allocate.  If the number of huge pages of default size is implicitly
	specified, it can not be overwritten by a hugepagesz,hugepages
	parameter pair for the default size.  This parameter also has a
	node format.  The node format specifies the number of huge pages
	to allocate on specific nodes.

	For example, on an architecture with 2M default huge page size::
@@ -138,6 +140,14 @@ hugepages

	indicating that the hugepages=512 parameter is ignored.  If a hugepages
	parameter is preceded by an invalid hugepagesz parameter, it will
	be ignored.

	Node format example::

		hugepagesz=2M hugepages=0:1,1:2

	It will allocate 1 2M hugepage on node0 and 2 2M hugepages on node1.
	If the node number is invalid, the parameter will be ignored.

default_hugepagesz
	Specify the default huge page size.  This parameter can
	only be specified once on the command line.  default_hugepagesz can
@@ -234,8 +244,12 @@ will exist, of the form::

	hugepages-${size}kB

Inside each of these directories, the set of files contained in ``/proc``
will exist.  In addition, two additional interfaces for demoting huge
pages may exist::

	demote
	demote_size
	nr_hugepages
	nr_hugepages_mempolicy
	nr_overcommit_hugepages
@@ -243,7 +257,29 @@ Inside each of these directories, the same set of files will exist::

	resv_hugepages
	surplus_hugepages

The demote interfaces provide the ability to split a huge page into
smaller huge pages.  For example, the x86 architecture supports both
1GB and 2MB huge page sizes.  A 1GB huge page can be split into 512
2MB huge pages.  Demote interfaces are not available for the smallest
huge page size.  The demote interfaces are:

demote_size
	is the size of demoted pages.  When a page is demoted, a corresponding
	number of huge pages of demote_size will be created.  By default,
	demote_size is set to the next smaller huge page size.  If there are
	multiple smaller huge page sizes, demote_size can be set to any of
	these smaller sizes.  Only huge page sizes less than the current huge
	page size are allowed.

demote
	is used to demote a number of huge pages.  A user with root privileges
	can write to this file.  It may not be possible to demote the
	requested number of huge pages.  To determine how many pages were
	actually demoted, compare the value of nr_hugepages before and after
	writing to the demote interface.  demote is a write only interface.

The interfaces which are the same as in ``/proc`` (all except demote and
demote_size) function as described above for the default huge page-sized case.
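A minimal usage sketch (paths, counts, and the printed ``demote_size`` value are illustrative; the 1GB-to-2MB split assumes a system configured with both huge page sizes)::

	# cd /sys/kernel/mm/hugepages/hugepages-1048576kB
	# cat demote_size
	2048kB
	# cat nr_hugepages
	2
	# echo 1 > demote
	# cat nr_hugepages
	1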

.. _mem_policy_and_hp_alloc:
@@ -37,5 +37,7 @@ the Linux memory management.

   numaperf
   pagemap
   soft-dirty
   swap_numa
   transhuge
   userfaultfd
   zswap
@@ -165,9 +165,8 @@ Or alternatively::

	% echo 1 > /sys/devices/system/memory/memoryXXX/online

The kernel will select the target zone automatically, depending on the
configured ``online_policy``.

One can explicitly request to associate an offline memory block with
ZONE_MOVABLE by::
|
||||||
|
|
||||||
% echo online > /sys/devices/system/memory/auto_online_blocks
|
% echo online > /sys/devices/system/memory/auto_online_blocks
|
||||||
|
|
||||||
|
Similarly to manual onlining, with ``online`` the kernel will select the
|
||||||
|
target zone automatically, depending on the configured ``online_policy``.
|
||||||
|
|
||||||
Modifying the auto-online behavior will only affect all subsequently added
|
Modifying the auto-online behavior will only affect all subsequently added
|
||||||
memory blocks only.
|
memory blocks only.
|
||||||
|
|
||||||
|
@@ -393,11 +395,16 @@ command line parameters are relevant:

======================== =======================================================
``memhp_default_state``	 configure auto-onlining by essentially setting
                         ``/sys/devices/system/memory/auto_online_blocks``.
``movable_node``	 configure automatic zone selection in the kernel when
                         using the ``contig-zones`` online policy.  When
                         set, the kernel will default to ZONE_MOVABLE when
                         onlining a memory block, unless other zones can be kept
                         contiguous.
======================== =======================================================

See Documentation/admin-guide/kernel-parameters.txt for a more generic
description of these command line parameters.
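As a sketch (an illustrative combination of the two parameters described above), a boot command line could contain::

	memhp_default_state=online movable_node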
Module Parameters
------------------
@@ -410,24 +417,118 @@ them with ``memory_hotplug.`` such as::

and they can be observed (and some even modified at runtime) via::

	/sys/module/memory_hotplug/parameters/

The following module parameters are currently defined:

================================ ===============================================
``memmap_on_memory``             read-write: Allocate memory for the memmap from
                                 the added memory block itself. Even if enabled,
                                 actual support depends on various other system
                                 properties and should only be regarded as a
                                 hint whether the behavior would be desired.

                                 While allocating the memmap from the memory
                                 block itself makes memory hotplug less likely
                                 to fail and keeps the memmap on the same NUMA
                                 node in any case, it can fragment physical
                                 memory in a way that huge pages in bigger
                                 granularity cannot be formed on hotplugged
                                 memory.
``online_policy``                read-write: Set the basic policy used for
                                 automatic zone selection when onlining memory
                                 blocks without specifying a target zone.
                                 ``contig-zones`` has been the kernel default
                                 before this parameter was added. After an
                                 online policy was configured and memory was
                                 onlined, the policy should not be changed
                                 anymore.

                                 When set to ``contig-zones``, the kernel will
                                 try keeping zones contiguous. If a memory block
                                 intersects multiple zones or no zone, the
                                 behavior depends on the ``movable_node`` kernel
                                 command line parameter: default to ZONE_MOVABLE
                                 if set, default to the applicable kernel zone
                                 (usually ZONE_NORMAL) if not set.

                                 When set to ``auto-movable``, the kernel will
                                 try onlining memory blocks to ZONE_MOVABLE if
                                 possible according to the configuration and
                                 memory device details. With this policy, one
                                 can avoid zone imbalances when eventually
                                 hotplugging a lot of memory later and still
                                 wanting to be able to hotunplug as much as
                                 possible reliably, very desirable in
                                 virtualized environments. This policy ignores
                                 the ``movable_node`` kernel command line
                                 parameter and isn't really applicable in
                                 environments that require it (e.g., bare metal
                                 with hotunpluggable nodes) where hotplugged
                                 memory might be exposed via the
                                 firmware-provided memory map early during boot
                                 to the system instead of getting detected,
                                 added and onlined later during boot (such as
                                 done by virtio-mem or by some hypervisors
                                 implementing emulated DIMMs). As one example, a
                                 hotplugged DIMM will be onlined either
                                 completely to ZONE_MOVABLE or completely to
                                 ZONE_NORMAL, not a mixture.
                                 As another example, as many memory blocks
                                 belonging to a virtio-mem device will be
                                 onlined to ZONE_MOVABLE as possible,
                                 special-casing units of memory blocks that can
                                 only get hotunplugged together. *This policy
                                 does not protect from setups that are
                                 problematic with ZONE_MOVABLE and does not
                                 change the zone of memory blocks dynamically
                                 after they were onlined.*
``auto_movable_ratio``           read-write: Set the maximum MOVABLE:KERNEL
                                 memory ratio in % for the ``auto-movable``
                                 online policy. Whether the ratio applies only
                                 for the system across all NUMA nodes or also
                                 per NUMA nodes depends on the
                                 ``auto_movable_numa_aware`` configuration.

                                 All accounting is based on present memory pages
                                 in the zones combined with accounting per
                                 memory device. Memory dedicated to the CMA
                                 allocator is accounted as MOVABLE, although
                                 residing on one of the kernel zones. The
                                 possible ratio depends on the actual workload.
                                 The kernel default is "301" %, for example,
                                 allowing for hotplugging 24 GiB to an 8 GiB VM
                                 and automatically onlining all hotplugged
                                 memory to ZONE_MOVABLE in many setups. The
                                 additional 1% deals with some pages being not
                                 present, for example, because of some firmware
                                 allocations.

                                 Note that ZONE_NORMAL memory provided by one
                                 memory device does not allow for more
                                 ZONE_MOVABLE memory for a different memory
                                 device. As one example, onlining memory of a
                                 hotplugged DIMM to ZONE_NORMAL will not allow
                                 for another hotplugged DIMM to get onlined to
                                 ZONE_MOVABLE automatically. In contrast, memory
                                 hotplugged by a virtio-mem device that got
                                 onlined to ZONE_NORMAL will allow for more
                                 ZONE_MOVABLE memory within *the same*
                                 virtio-mem device.
``auto_movable_numa_aware``      read-write: Configure whether the
                                 ``auto_movable_ratio`` in the ``auto-movable``
                                 online policy also applies per NUMA
                                 node in addition to the whole system across all
                                 NUMA nodes. The kernel default is "Y".

                                 Disabling NUMA awareness can be helpful when
                                 dealing with NUMA nodes that should be
                                 completely hotunpluggable, onlining the memory
                                 completely to ZONE_MOVABLE automatically if
                                 possible.

                                 Parameter availability depends on CONFIG_NUMA.
================================ ===============================================
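As a runtime sketch (an illustrative sequence; ``301`` and ``Y`` are the documented defaults shown here explicitly), the ``auto-movable`` policy could be configured before hotplugging memory via::

	% echo auto-movable > /sys/module/memory_hotplug/parameters/online_policy
	% echo 301 > /sys/module/memory_hotplug/parameters/auto_movable_ratio
	% echo Y > /sys/module/memory_hotplug/parameters/auto_movable_numa_aware
	% echo online > /sys/devices/system/memory/auto_online_blocks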

ZONE_MOVABLE
============
@@ -90,13 +90,14 @@ Short descriptions to the page flags

====================================

0 - LOCKED
    The page is being locked for exclusive access, e.g. by undergoing read/write
    IO.
7 - SLAB
    The page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator.
    When compound page is used, SLUB/SLQB will only set this flag on the head
    page; SLOB will not flag it at all.
10 - BUDDY
    A free memory block managed by the buddy system allocator.
    The buddy system organizes free memory in blocks of various orders.
    An order N block has 2^N physically contiguous pages, with the BUDDY flag
    set for and _only_ for the first page.
@@ -112,65 +113,65 @@ Short descriptions to the page flags

16 - COMPOUND_TAIL
    A compound page tail (see description above).
17 - HUGE
    This is an integral part of a HugeTLB page.
19 - HWPOISON
    Hardware detected memory corruption on this page: don't touch the data!
20 - NOPAGE
    No page frame exists at the requested address.
21 - KSM
    Identical memory pages dynamically shared between one or more processes.
22 - THP
    Contiguous pages which construct transparent hugepages.
23 - OFFLINE
    The page is logically offline.
24 - ZERO_PAGE
    Zero page for pfn_zero or huge_zero page.
25 - IDLE
    The page has not been accessed since it was marked idle (see
    :ref:`Documentation/admin-guide/mm/idle_page_tracking.rst <idle_page_tracking>`).
    Note that this flag may be stale in case the page was accessed via
    a PTE.  To make sure the flag is up-to-date one has to read
    ``/sys/kernel/mm/page_idle/bitmap`` first.
26 - PGTABLE
    The page is in use as a page table.

IO related page flags
---------------------

1 - ERROR
    IO error occurred.
3 - UPTODATE
    The page has up-to-date data.
    ie. for file backed page: (in-memory data revision >= on-disk one)
4 - DIRTY
    The page has been written to, hence contains new data.
    i.e. for file backed page: (in-memory data revision > on-disk one)
8 - WRITEBACK
    The page is being synced to disk.

LRU related page flags
----------------------

5 - LRU
    The page is in one of the LRU lists.
6 - ACTIVE
    The page is in the active LRU list.
18 - UNEVICTABLE
    The page is in the unevictable (non-)LRU list.  It is somehow pinned and
    not a candidate for LRU page reclaims, e.g. ramfs pages,
    shmctl(SHM_LOCK) and mlock() memory segments.
2 - REFERENCED
    The page has been referenced since last LRU list enqueue/requeue.
9 - RECLAIM
    The page will be reclaimed soon after its pageout IO completed.
11 - MMAP
    A memory mapped page.
12 - ANON
    A memory mapped page that is not part of a file.
13 - SWAPCACHE
    The page is mapped to swap space, i.e. has an associated swap entry.
14 - SWAPBACKED
    The page is backed by swap/RAM.

The page-types tool in the tools/vm directory can be used to query the
above flags.
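For example (an illustrative invocation; the exact columns and totals depend on the kernel and tool version), the flags of a process's pages can be summarized with::

    # page-types -p $(pidof <your workload>)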
@@ -57,7 +57,6 @@ The third argument (arg) passes a pointer of struct memory_notify::

	unsigned long start_pfn;
	unsigned long nr_pages;
	int status_change_nid_normal;
	int status_change_nid;
    }
@@ -65,8 +64,6 @@ The third argument (arg) passes a pointer of struct memory_notify::

- nr_pages is # of pages of online/offline memory.
- status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
  is (will be) set/clear, if this is -1, then nodemask status is not changed.
- status_change_nid is set node id when N_MEMORY of nodemask is (will be)
  set/clear. It means a new(memoryless) node gets new memory by online and a
  node loses all memory. If this is -1, then nodemask status is not changed.
@@ -231,10 +231,14 @@ Guarded allocations are set up based on the sample interval. After expiration

of the sample interval, the next allocation through the main allocator (SLAB or
SLUB) returns a guarded allocation from the KFENCE object pool (allocation
sizes up to PAGE_SIZE are supported). At this point, the timer is reset, and
the next allocation is set up after the expiration of the interval.

When using ``CONFIG_KFENCE_STATIC_KEYS=y``, KFENCE allocations are "gated"
through the main allocator's fast-path by relying on static branches via the
static keys infrastructure. The static branch is toggled to redirect the
allocation to KFENCE. Depending on sample interval, target workloads, and
system architecture, this may perform better than the simple dynamic branch.
Careful benchmarking is recommended.

KFENCE objects each reside on a dedicated page, at either the left or right
page boundaries selected at random. The pages to the left and right of the
@ -269,6 +273,17 @@ tail of KFENCE's freelist, so that the least recently freed objects are reused
|
||||||
first, and the chances of detecting use-after-frees of recently freed objects
|
first, and the chances of detecting use-after-frees of recently freed objects
|
||||||
is increased.
|
is increased.
|
||||||
|
|
||||||
|
If pool utilization reaches 75% (default) or above, to reduce the risk of the
|
||||||
|
pool eventually being fully occupied by allocated objects yet ensure diverse
|
||||||
|
coverage of allocations, KFENCE limits currently covered allocations of the
|
||||||
|
same source from further filling up the pool. The "source" of an allocation is
|
||||||
|
based on its partial allocation stack trace. A side-effect is that this also
|
||||||
|
limits frequent long-lived allocations (e.g. pagecache) of the same source
|
||||||
|
filling up the pool permanently, which is the most common risk for the pool
|
||||||
|
becoming full and the sampled allocation rate dropping to zero. The threshold
|
||||||
|
at which to start limiting currently covered allocations can be configured via
|
||||||
|
the boot parameter ``kfence.skip_covered_thresh`` (pool usage%).
|
||||||
|
|
||||||
Interface
|
Interface
|
||||||
---------
|
---------
|
||||||
|
|
||||||
|
|
|
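For readers unfamiliar with the static keys infrastructure, the gating pattern
is roughly the following (a simplified sketch, not KFENCE's actual code; all
``example_*`` names are hypothetical)::

    #include <linux/jump_label.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    /* Hypothetical gate: compiled out of the fast path until armed. */
    static DEFINE_STATIC_KEY_FALSE(example_alloc_gate);

    static void *example_fast_alloc(size_t size)
    {
            return kmalloc(size, GFP_KERNEL);   /* the normal fast path */
    }

    static void *example_guarded_alloc(size_t size)
    {
            return kmalloc(size, GFP_KERNEL);   /* stand-in for a guarded pool */
    }

    static inline void *example_alloc(size_t size)
    {
            /*
             * With the key disabled this compiles to a straight-line fast
             * path; enabling the key live-patches the guarded branch in.
             */
            if (static_branch_unlikely(&example_alloc_gate))
                    return example_guarded_alloc(size);
            return example_fast_alloc(size);
    }

    /* A deferred work item arms the gate once per sample interval. */
    static void example_sample_fn(struct work_struct *work)
    {
            static_branch_enable(&example_alloc_gate);
    }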
@@ -63,7 +63,6 @@ pointer to struct memory_notify::
         unsigned long start_pfn;
         unsigned long nr_pages;
         int status_change_nid_normal;
-        int status_change_nid_high;
         int status_change_nid;
 }

@@ -74,9 +73,6 @@ pointer to struct memory_notify::
 - status_change_nid_normal is the node id set when N_NORMAL_MEMORY of the
   nodemask is set/cleared; if it is -1, the nodemask status is not changed.
 
-- status_change_nid_high is the node id set when N_HIGH_MEMORY of the
-  nodemask is set/cleared; if it is -1, the nodemask status is not changed.
-
 - status_change_nid is the node id set when N_MEMORY of the nodemask is
   (or will be) set/cleared. It means a new (memoryless) node gains memory
   by onlining, or a node loses all of its memory. If it is -1, the nodemask
   status is not changed.
@@ -35,13 +35,17 @@ two parts:
 1. Identification of the monitoring target address range for the address space.
 2. Access check of specific address range in the target space.
 
-DAMON currently provides the implementation of the primitives for only the
-virtual address spaces. Below two subsections describe how it works.
+DAMON currently provides the implementations of the primitives for the physical
+and virtual address spaces. The below two subsections describe how those work.
 
 
 VMA-based Target Address Range Construction
 -------------------------------------------
 
+This is only for the virtual address space primitives implementation. The
+implementation for the physical address space simply asks users to manually
+set the monitoring target address ranges.
+
 Only small parts in the super-huge virtual address space of the processes are
 mapped to the physical memory and accessed. Thus, tracking the unmapped
 address regions is just wasteful. However, because DAMON can deal with some

@@ -71,15 +75,18 @@ to make a reasonable trade-off. Below shows this in detail::
 PTE Accessed-bit Based Access Check
 -----------------------------------
 
-The implementation for the virtual address space uses PTE Accessed-bit for
-basic access checks. It finds the relevant PTE Accessed bit from the address
-by walking the page table for the target task of the address. In this way, the
-implementation finds and clears the bit for next sampling target address and
-checks whether the bit set again after one sampling period. This could disturb
-other kernel subsystems using the Accessed bits, namely Idle page tracking and
-the reclaim logic. To avoid such disturbances, DAMON makes it mutually
-exclusive with Idle page tracking and uses ``PG_idle`` and ``PG_young`` page
-flags to solve the conflict with the reclaim logic, as Idle page tracking does.
+Both of the implementations for the physical and virtual address spaces use
+the PTE Accessed-bit for basic access checks. The only difference is how they
+find the relevant PTE Accessed bit(s) for an address. While the implementation
+for the virtual address space walks the page table of the target task, the
+implementation for the physical address space walks every page table that has
+a mapping to the address. In this way, the implementations find and clear the
+bit(s) for the next sampling target address and check whether the bit(s) are
+set again after one sampling period. This could disturb other kernel
+subsystems using the Accessed bits, namely Idle page tracking and the reclaim
+logic. To avoid such disturbances, DAMON makes itself mutually exclusive with
+Idle page tracking and uses ``PG_idle`` and ``PG_young`` page flags to solve
+the conflict with the reclaim logic, as Idle page tracking does.
 
 
 Address Space Independent Core Mechanisms
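The check-clear-recheck cycle described above can be sketched as follows (a
simplified illustration, not DAMON's actual code; the ``example_*`` helpers
are hypothetical)::

    #include <linux/mm.h>
    #include <linux/pgtable.h>

    /*
     * Hypothetical sketch: clear the Accessed bit for one sampled address,
     * then look at it again after one sampling period has elapsed.
     */
    static void example_prepare_sample(struct vm_area_struct *vma,
                                       unsigned long addr, pte_t *pte)
    {
            /* Clear the Accessed bit so a later check sees fresh accesses only. */
            ptep_test_and_clear_young(vma, addr, pte);
    }

    static bool example_was_accessed(pte_t *pte)
    {
            /* Set again => the page was touched during the sampling period. */
            return pte_young(*pte);
    }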
@@ -36,10 +36,9 @@ constructions and actual access checks can be implemented and configured on the
 DAMON core by the users. In this way, DAMON users can monitor any address
 space with any access check technique.
 
-Nonetheless, DAMON provides vma tracking and PTE Accessed bit check based
+Nonetheless, DAMON provides vma/rmap tracking and PTE Accessed bit check based
 implementations of the address space dependent functions for the virtual memory
-by default, for a reference and convenient use. In near future, we will
-provide those for physical memory address space.
+and the physical memory by default, for a reference and convenient use.
 
 
 Can I simply monitor page granularity?
@@ -27,4 +27,3 @@ workloads and systems.
    faq
    design
    api
-   plans
@@ -3,27 +3,11 @@ Linux Memory Management Documentation
 =====================================
 
 This is a collection of documents about the Linux memory management (mm)
-subsystem. If you are looking for advice on simply allocating memory,
-see the :ref:`memory_allocation`.
-
-User guides for MM features
-===========================
-
-The following documents provide guides for controlling and tuning
-various features of the Linux memory management
-
-.. toctree::
-   :maxdepth: 1
-
-   swap_numa
-   zswap
-
-Kernel developers MM documentation
-==================================
-
-The below documents describe MM internals with different level of
-details ranging from notes and mailing list responses to elaborate
-descriptions of data structures and algorithms.
+subsystem internals, with different levels of detail ranging from notes and
+mailing list responses to elaborate descriptions of data structures and
+algorithms. If you are looking for advice on simply allocating memory, see the
+:ref:`memory_allocation`. For controlling and tuning guides, see the
+:doc:`admin guide <../admin-guide/mm/index>`.
 
 .. toctree::
    :maxdepth: 1
@@ -85,5 +85,26 @@ Usage
         cat /sys/kernel/debug/page_owner > page_owner_full.txt
         ./page_owner_sort page_owner_full.txt sorted_page_owner.txt
 
+   The general output of ``page_owner_full.txt`` is as follows:
+
+        Page allocated via order XXX, ...
+        PFN XXX ...
+        // Detailed stack
+
+        Page allocated via order XXX, ...
+        PFN XXX ...
+        // Detailed stack
+
+   The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
+   in buf, uses regexp to extract the page order value, counts the occurrences
+   and pages of each buf, and finally sorts them according to the occurrence
+   count.
+
    See the result about who allocated each page
-   in the ``sorted_page_owner.txt``.
+   in ``sorted_page_owner.txt``. General output:
+
+        XXX times, XXX pages:
+        Page allocated via order XXX, ...
+        // Detailed stack
+
+   By default, ``page_owner_sort`` sorts according to the times of buf.
+   If you want to sort by the page counts of buf instead, use the ``-m``
+   parameter.
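The aggregation that ``page_owner_sort`` performs can be sketched as follows
(a rough userspace illustration, assuming identical stack blocks compare equal
as whole strings; this is not the tool's actual source)::

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* One aggregated record: a unique stack block and its counters. */
    struct block {
            char *txt;      /* the "Page allocated via order..." block */
            int times;      /* how many times this block occurred */
            int pages;      /* total pages (sum of 1 << order) */
    };

    static int cmp_times(const void *a, const void *b)
    {
            return ((const struct block *)b)->times -
                   ((const struct block *)a)->times;
    }

    /*
     * Given parsed blocks, sort them so the most frequent allocation
     * stacks come first, as the documentation above describes.
     */
    static void sort_blocks(struct block *blks, size_t n)
    {
            qsort(blks, n, sizeof(*blks), cmp_times);
    }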
@@ -5220,7 +5220,7 @@ F: net/ax25/ax25_timer.c
 F:      net/ax25/sysctl_net_ax25.c
 
 DATA ACCESS MONITOR
-M:      SeongJae Park <sjpark@amazon.de>
+M:      SeongJae Park <sj@kernel.org>
 L:      linux-mm@kvack.org
 S:      Maintained
 F:      Documentation/admin-guide/mm/damon/
Makefile

@@ -1011,6 +1011,21 @@ ifdef CONFIG_CC_IS_GCC
 KBUILD_CFLAGS += -Wno-maybe-uninitialized
 endif
 
+ifdef CONFIG_CC_IS_GCC
+# The allocators already balk at large sizes, so silence the compiler
+# warnings for bounds checks involving those possible values. While
+# -Wno-alloc-size-larger-than would normally be used here, earlier versions
+# of gcc (<9.1) weirdly don't handle the option correctly when _other_
+# warnings are produced (?!). Using -Walloc-size-larger-than=SIZE_MAX
+# doesn't work (as it is documented to), silently resolving to "0" prior to
+# version 9.1 (and producing an error more recently). Numeric values larger
+# than PTRDIFF_MAX also don't work prior to version 9.1, which are silently
+# ignored, continuing to default to PTRDIFF_MAX. So, left with no other
+# choice, we must perform a versioned check to disable this warning.
+# https://lore.kernel.org/lkml/20210824115859.187f272f@canb.auug.org.au
+KBUILD_CFLAGS += $(call cc-ifversion, -ge, 0901, -Wno-alloc-size-larger-than)
+endif
+
 # disable invalid "can't wrap" optimizations for signed / pointers
 KBUILD_CFLAGS += -fno-strict-overflow
@@ -233,7 +233,7 @@ albacore_init_arch(void)
                 unsigned long size;
 
                 size = initrd_end - initrd_start;
-                memblock_free(__pa(initrd_start), PAGE_ALIGN(size));
+                memblock_free((void *)initrd_start, PAGE_ALIGN(size));
                 if (!move_initrd(pci_mem))
                         printk("irongate_init_arch: initrd too big "
                                "(%ldK)\ndisabling initrd\n",
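This hunk and many that follow track one theme of the series: after the
memblock rework, ``memblock_free()`` takes a virtual ``void *`` address, frees
of physical ranges move to the new ``memblock_phys_free()``, and the old
``memblock_free_early()`` and ``memblock_free_ptr()`` helpers are folded into
``memblock_free()``. A small sketch of how the two frees pair with their
allocators (illustrative only, not code from this series)::

    #include <linux/init.h>
    #include <linux/memblock.h>

    static void __init example_memblock_usage(void)
    {
            void *buf;
            phys_addr_t phys;

            buf = memblock_alloc(PAGE_SIZE, PAGE_SIZE);       /* returns void * */
            if (buf)
                    memblock_free(buf, PAGE_SIZE);            /* virtual pointer */

            phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); /* returns phys_addr_t */
            if (phys)
                    memblock_phys_free(phys, PAGE_SIZE);      /* physical range */
    }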
@@ -59,13 +59,13 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
 
                 low_mem_sz = size;
                 in_use = 1;
-                memblock_add_node(base, size, 0);
+                memblock_add_node(base, size, 0, MEMBLOCK_NONE);
         } else {
 #ifdef CONFIG_HIGHMEM
                 high_mem_start = base;
                 high_mem_sz = size;
                 in_use = 1;
-                memblock_add_node(base, size, 1);
+                memblock_add_node(base, size, 1, MEMBLOCK_NONE);
                 memblock_reserve(base, size);
 #endif
         }
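``memblock_add_node()`` likewise grows a ``flags`` argument in this series;
existing callers pass ``MEMBLOCK_NONE`` to keep their old behaviour, as in
this minimal sketch (illustrative only)::

    #include <linux/init.h>
    #include <linux/memblock.h>

    /* Illustrative: register a RAM range on node 0, no special flags. */
    static void __init example_add_ram(phys_addr_t base, phys_addr_t size)
    {
            memblock_add_node(base, size, 0, MEMBLOCK_NONE);
    }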
@@ -173,7 +173,7 @@ static void __init highmem_init(void)
 #ifdef CONFIG_HIGHMEM
         unsigned long tmp;
 
-        memblock_free(high_mem_start, high_mem_sz);
+        memblock_phys_free(high_mem_start, high_mem_sz);
         for (tmp = min_high_pfn; tmp < max_high_pfn; tmp++)
                 free_highmem_page(pfn_to_page(tmp));
 #endif

@@ -339,7 +339,7 @@ err_fabric:
 err_sysctrl:
         iounmap(relocation);
 err_reloc:
-        memblock_free(hip04_boot_method[0], hip04_boot_method[1]);
+        memblock_phys_free(hip04_boot_method[0], hip04_boot_method[1]);
 err:
         return ret;
 }
@@ -158,7 +158,7 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align)
                 panic("Failed to steal %pa bytes at %pS\n",
                       &size, (void *)_RET_IP_);
 
-        memblock_free(phys, size);
+        memblock_phys_free(phys, size);
         memblock_remove(phys, size);
 
         return phys;
@@ -1163,6 +1163,10 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK
         def_bool y
         depends on NUMA
 
+config NEED_PER_CPU_PAGE_FIRST_CHUNK
+        def_bool y
+        depends on NUMA
+
 source "kernel/Kconfig.hz"
 
 config ARCH_SPARSEMEM_ENABLE
@@ -287,6 +287,22 @@ static void __init kasan_init_depth(void)
         init_task.kasan_depth = 0;
 }
 
+#ifdef CONFIG_KASAN_VMALLOC
+void __init kasan_populate_early_vm_area_shadow(void *start, unsigned long size)
+{
+        unsigned long shadow_start, shadow_end;
+
+        if (!is_vmalloc_or_module_addr(start))
+                return;
+
+        shadow_start = (unsigned long)kasan_mem_to_shadow(start);
+        shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE);
+        shadow_end = (unsigned long)kasan_mem_to_shadow(start + size);
+        shadow_end = ALIGN(shadow_end, PAGE_SIZE);
+        kasan_map_populate(shadow_start, shadow_end, NUMA_NO_NODE);
+}
+#endif
+
 void __init kasan_init(void)
 {
         kasan_init_shadow();
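For context, the page alignment above matters because
``kasan_mem_to_shadow()`` compresses addresses by the shadow scale; with
generic KASAN one shadow byte covers ``1 << KASAN_SHADOW_SCALE_SHIFT`` bytes
of memory, roughly as in this sketch::

    /* Sketch of the mapping performed by kasan_mem_to_shadow(). */
    static inline void *example_mem_to_shadow(const void *addr)
    {
            return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
                    + KASAN_SHADOW_OFFSET;
    }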
@@ -738,8 +738,8 @@ void __init paging_init(void)
         cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
         init_mm.pgd = swapper_pg_dir;
 
-        memblock_free(__pa_symbol(init_pg_dir),
-                      __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
+        memblock_phys_free(__pa_symbol(init_pg_dir),
+                           __pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
 
         memblock_allow_resize();
 }
@@ -153,7 +153,7 @@ find_memory (void)
         efi_memmap_walk(find_max_min_low_pfn, NULL);
         max_pfn = max_low_pfn;
 
-        memblock_add_node(0, PFN_PHYS(max_low_pfn), 0);
+        memblock_add_node(0, PFN_PHYS(max_low_pfn), 0, MEMBLOCK_NONE);
 
         find_initrd();
 
@@ -378,7 +378,7 @@ int __init register_active_ranges(u64 start, u64 len, int nid)
 #endif
 
         if (start < end)
-                memblock_add_node(__pa(start), end - start, nid);
+                memblock_add_node(__pa(start), end - start, nid, MEMBLOCK_NONE);
         return 0;
 }
 
@@ -174,7 +174,8 @@ void __init cf_bootmem_alloc(void)
         m68k_memory[0].addr = _rambase;
         m68k_memory[0].size = _ramend - _rambase;
 
-        memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0);
+        memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0,
+                          MEMBLOCK_NONE);
 
         /* compute total pages in system */
         num_pages = PFN_DOWN(_ramend - _rambase);
@@ -410,7 +410,8 @@ void __init paging_init(void)
 
         min_addr = m68k_memory[0].addr;
         max_addr = min_addr + m68k_memory[0].size;
-        memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0);
+        memblock_add_node(m68k_memory[0].addr, m68k_memory[0].size, 0,
+                          MEMBLOCK_NONE);
         for (i = 1; i < m68k_num_memory;) {
                 if (m68k_memory[i].addr < min_addr) {
                         printk("Ignoring memory chunk at 0x%lx:0x%lx before the first chunk\n",

@@ -421,7 +422,8 @@ void __init paging_init(void)
                                 (m68k_num_memory - i) * sizeof(struct m68k_mem_info));
                         continue;
                 }
-                memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i);
+                memblock_add_node(m68k_memory[i].addr, m68k_memory[i].size, i,
+                                  MEMBLOCK_NONE);
                 addr = m68k_memory[i].addr + m68k_memory[i].size;
                 if (addr > max_addr)
                         max_addr = addr;
@@ -77,7 +77,9 @@ void __init szmem(unsigned int node)
                                 (u32)node_id, mem_type, mem_start, mem_size);
                         pr_info("       start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n",
                                 start_pfn, end_pfn, num_physpages);
-                        memblock_add_node(PFN_PHYS(start_pfn), PFN_PHYS(node_psize), node);
+                        memblock_add_node(PFN_PHYS(start_pfn),
+                                          PFN_PHYS(node_psize), node,
+                                          MEMBLOCK_NONE);
                         break;
                 case SYSTEM_RAM_RESERVED:
                         pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx MB\n",
@@ -529,7 +529,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size,
 
 static void __init pcpu_fc_free(void *ptr, size_t size)
 {
-        memblock_free_early(__pa(ptr), size);
+        memblock_free(ptr, size);
 }
 
 void __init setup_per_cpu_areas(void)
@@ -341,7 +341,8 @@ static void __init szmem(void)
                                 continue;
                         }
                         memblock_add_node(PFN_PHYS(slot_getbasepfn(node, slot)),
-                                          PFN_PHYS(slot_psize), node);
+                                          PFN_PHYS(slot_psize), node,
+                                          MEMBLOCK_NONE);
                 }
         }
 }
@@ -69,10 +69,10 @@ static void __init ip30_mem_init(void)
                 total_mem += size;
 
                 if (addr >= IP30_REAL_MEMORY_START)
-                        memblock_free(addr, size);
+                        memblock_phys_free(addr, size);
                 else if ((addr + size) > IP30_REAL_MEMORY_START)
-                        memblock_free(IP30_REAL_MEMORY_START,
-                                      size - IP30_MAX_PROM_MEMORY);
+                        memblock_phys_free(IP30_REAL_MEMORY_START,
+                                           size - IP30_MAX_PROM_MEMORY);
         }
         pr_info("Detected %luMB of physical memory.\n", MEM_SHIFT(total_mem));
 }
@@ -274,7 +274,6 @@ CONFIG_NLS_UTF8=y
 CONFIG_ENCRYPTED_KEYS=y
 CONFIG_SECURITY=y
 CONFIG_HARDENED_USERCOPY=y
-# CONFIG_HARDENED_USERCOPY_FALLBACK is not set
 CONFIG_HARDENED_USERCOPY_PAGESPAN=y
 CONFIG_FORTIFY_SOURCE=y
 CONFIG_SECURITY_LOCKDOWN_LSM=y
@@ -31,7 +31,7 @@ struct machdep_calls {
 #ifdef CONFIG_PM
         void            (*iommu_restore)(void);
 #endif
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
         unsigned long   (*memory_block_size)(void);
 #endif
 #endif /* CONFIG_PPC64 */
@@ -6,21 +6,8 @@
 #include <linux/elf.h>
 #include <linux/uaccess.h>
 
-#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
-
 #include <asm-generic/sections.h>
 
-extern bool init_mem_is_free;
-
-static inline int arch_is_kernel_initmem_freed(unsigned long addr)
-{
-        if (!init_mem_is_free)
-                return 0;
-
-        return addr >= (unsigned long)__init_begin &&
-                addr < (unsigned long)__init_end;
-}
-
 extern char __head_end[];
 
 #ifdef __powerpc64__
@@ -1095,8 +1095,8 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char
 
         cpufeatures_setup_finished();
 
-        memblock_free(__pa(dt_cpu_features),
-                        sizeof(struct dt_cpu_feature)*nr_dt_cpu_features);
+        memblock_free(dt_cpu_features,
+                      sizeof(struct dt_cpu_feature) * nr_dt_cpu_features);
 
         return 0;
 }
@@ -322,8 +322,8 @@ void __init free_unused_pacas(void)
 
         new_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids;
         if (new_ptrs_size < paca_ptrs_size)
-                memblock_free(__pa(paca_ptrs) + new_ptrs_size,
-                                        paca_ptrs_size - new_ptrs_size);
+                memblock_phys_free(__pa(paca_ptrs) + new_ptrs_size,
+                                   paca_ptrs_size - new_ptrs_size);
 
         paca_nr_cpu_ids = nr_cpu_ids;
         paca_ptrs_size = new_ptrs_size;

@@ -331,8 +331,8 @@ void __init free_unused_pacas(void)
 #ifdef CONFIG_PPC_BOOK3S_64
         if (early_radix_enabled()) {
                 /* Ugly fixup, see new_slb_shadow() */
-                memblock_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr),
-                                sizeof(struct slb_shadow));
+                memblock_phys_free(__pa(paca_ptrs[boot_cpuid]->slb_shadow_ptr),
+                                   sizeof(struct slb_shadow));
                 paca_ptrs[boot_cpuid]->slb_shadow_ptr = NULL;
         }
 #endif
@@ -822,7 +822,7 @@ static void __init smp_setup_pacas(void)
                 set_hard_smp_processor_id(cpu, cpu_to_phys_id[cpu]);
         }
 
-        memblock_free(__pa(cpu_to_phys_id), nr_cpu_ids * sizeof(u32));
+        memblock_free(cpu_to_phys_id, nr_cpu_ids * sizeof(u32));
         cpu_to_phys_id = NULL;
 }
 #endif
@@ -812,7 +812,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
 
 static void __init pcpu_free_bootmem(void *ptr, size_t size)
 {
-        memblock_free(__pa(ptr), size);
+        memblock_free(ptr, size);
 }
 
 static int pcpu_cpu_distance(unsigned int from, unsigned int to)

@@ -912,7 +912,7 @@ void __init setup_per_cpu_areas(void)
 }
 #endif
 
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
 unsigned long memory_block_size_bytes(void)
 {
         if (ppc_md.memory_block_size)
@@ -229,17 +229,22 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
         m->hstate = hstate;
         return 1;
 }
 
+bool __init hugetlb_node_alloc_supported(void)
+{
+        return false;
+}
 #endif
 
 
-int __init alloc_bootmem_huge_page(struct hstate *h)
+int __init alloc_bootmem_huge_page(struct hstate *h, int nid)
 {
 
 #ifdef CONFIG_PPC_BOOK3S_64
         if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled())
                 return pseries_alloc_bootmem_huge_page(h);
 #endif
-        return __alloc_bootmem_huge_page(h);
+        return __alloc_bootmem_huge_page(h, nid);
 }
 
 #ifndef CONFIG_PPC_BOOK3S_64
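The new ``nid`` argument lets boot-time huge page allocation target a NUMA
node, while ``hugetlb_node_alloc_supported()`` lets pseries opt out of
node-targeted allocation. A hedged sketch of a caller (illustrative only, not
the actual mm/hugetlb.c logic)::

    #include <linux/hugetlb.h>
    #include <linux/numa.h>

    /* Illustrative: try to place one boot-time huge page on a given node. */
    static int __init example_alloc_on_node(struct hstate *h, int nid)
    {
            if (!hugetlb_node_alloc_supported())
                    nid = NUMA_NO_NODE;     /* fall back to the generic path */
            return alloc_bootmem_huge_page(h, nid);
    }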
@@ -2981,7 +2981,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
         if (!phb->hose) {
                 pr_err("  Can't allocate PCI controller for %pOF\n",
                        np);
-                memblock_free(__pa(phb), sizeof(struct pnv_phb));
+                memblock_free(phb, sizeof(struct pnv_phb));
                 return;
         }
 
@@ -440,7 +440,7 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 }
 #endif /* CONFIG_KEXEC_CORE */
 
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
 static unsigned long pnv_memory_block_size(void)
 {
         /*

@@ -553,7 +553,7 @@ define_machine(powernv) {
 #ifdef CONFIG_KEXEC_CORE
         .kexec_cpu_down         = pnv_kexec_cpu_down,
 #endif
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
         .memory_block_size      = pnv_memory_block_size,
 #endif
 };
@@ -1088,7 +1088,7 @@ define_machine(pseries) {
         .machine_kexec          = pSeries_machine_kexec,
         .kexec_cpu_down         = pseries_kexec_cpu_down,
 #endif
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
         .memory_block_size      = pseries_memory_block_size,
 #endif
 };
@@ -57,8 +57,7 @@ void __init svm_swiotlb_init(void)
                 return;
 
 
-        memblock_free_early(__pa(vstart),
-                            PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+        memblock_free(vstart, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
 
         panic("SVM: Cannot allocate SWIOTLB buffer");
 }
@@ -230,13 +230,13 @@ static void __init init_resources(void)
 
         /* Clean-up any unused pre-allocated resources */
         if (res_idx >= 0)
-                memblock_free(__pa(mem_res), (res_idx + 1) * sizeof(*mem_res));
+                memblock_free(mem_res, (res_idx + 1) * sizeof(*mem_res));
         return;
 
  error:
         /* Better an empty resource tree than an inconsistent one */
         release_child_resources(&iomem_resource);
-        memblock_free(__pa(mem_res), mem_res_sz);
+        memblock_free(mem_res, mem_res_sz);
 }
 
 
@@ -2,20 +2,8 @@
 #ifndef _S390_SECTIONS_H
 #define _S390_SECTIONS_H
 
-#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
-
 #include <asm-generic/sections.h>
 
-extern bool initmem_freed;
-
-static inline int arch_is_kernel_initmem_freed(unsigned long addr)
-{
-        if (!initmem_freed)
-                return 0;
-        return addr >= (unsigned long)__init_begin &&
-                addr < (unsigned long)__init_end;
-}
-
 /*
  * .boot.data section contains variables "shared" between the decompressor and
  * the decompressed kernel. The decompressor will store values in them, and
@@ -593,7 +593,8 @@ static void __init setup_resources(void)
          * part of the System RAM resource.
          */
         if (crashk_res.end) {
-                memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0);
+                memblock_add_node(crashk_res.start, resource_size(&crashk_res),
+                                  0, MEMBLOCK_NONE);
                 memblock_reserve(crashk_res.start, resource_size(&crashk_res));
                 insert_resource(&iomem_resource, &crashk_res);
         }

@@ -693,7 +694,7 @@ static void __init reserve_crashkernel(void)
         }
 
         if (register_memory_notifier(&kdump_mem_nb)) {
-                memblock_free(crash_base, crash_size);
+                memblock_phys_free(crash_base, crash_size);
                 return;
         }
 

@@ -748,7 +749,7 @@ static void __init free_mem_detect_info(void)
 
         get_mem_detect_reserved(&start, &size);
         if (size)
-                memblock_free(start, size);
+                memblock_phys_free(start, size);
 }
 
 static const char * __init get_mem_info_source(void)

@@ -793,7 +794,7 @@ static void __init check_initrd(void)
         if (initrd_data.start && initrd_data.size &&
             !memblock_is_region_memory(initrd_data.start, initrd_data.size)) {
                 pr_err("The initial RAM disk does not fit into the memory\n");
-                memblock_free(initrd_data.start, initrd_data.size);
+                memblock_phys_free(initrd_data.start, initrd_data.size);
                 initrd_start = initrd_end = 0;
         }
 #endif

@@ -890,7 +891,7 @@ static void __init setup_randomness(void)
 
         if (stsi(vmms, 3, 2, 2) == 0 && vmms->count)
                 add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count);
-        memblock_free((unsigned long) vmms, PAGE_SIZE);
+        memblock_phys_free((unsigned long)vmms, PAGE_SIZE);
 }
 
 /*
@@ -723,7 +723,7 @@ void __init smp_save_dump_cpus(void)
                 /* Get the CPU registers */
                 smp_save_cpu_regs(sa, addr, is_boot_cpu, page);
         }
-        memblock_free(page, PAGE_SIZE);
+        memblock_phys_free(page, PAGE_SIZE);
         diag_amode31_ops.diag308_reset();
         pcpu_set_smt(0);
 }

@@ -880,7 +880,7 @@ void __init smp_detect_cpus(void)
 
         /* Add CPUs present at boot */
         __smp_rescan_cpus(info, true);
-        memblock_free_early((unsigned long)info, sizeof(*info));
+        memblock_phys_free((unsigned long)info, sizeof(*info));
 }
 
 /*
@@ -64,7 +64,7 @@ void __init setup_uv(void)
         }
 
         if (uv_init(uv_stor_base, uv_info.uv_base_stor_len)) {
-                memblock_free(uv_stor_base, uv_info.uv_base_stor_len);
+                memblock_phys_free(uv_stor_base, uv_info.uv_base_stor_len);
                 goto fail;
         }
 
@@ -58,8 +58,6 @@ unsigned long empty_zero_page, zero_page_mask;
 EXPORT_SYMBOL(empty_zero_page);
 EXPORT_SYMBOL(zero_page_mask);
 
-bool initmem_freed;
-
 static void __init setup_zero_pages(void)
 {
         unsigned int order;

@@ -214,7 +212,6 @@ void __init mem_init(void)
 
 void free_initmem(void)
 {
-        initmem_freed = true;
         __set_memory((unsigned long)_sinittext,
                      (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
                      SET_MEMORY_RW | SET_MEMORY_NX);
@@ -399,5 +399,5 @@ void __init kasan_copy_shadow_mapping(void)
 
 void __init kasan_free_early_identity(void)
 {
-        memblock_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos);
+        memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos);
 }
@@ -560,7 +560,7 @@ static void __init ap325rxa_mv_mem_reserve(void)
         if (!phys)
                 panic("Failed to allocate CEU memory\n");
 
-        memblock_free(phys, size);
+        memblock_phys_free(phys, size);
         memblock_remove(phys, size);
 
         ceu_dma_membase = phys;

@@ -1502,7 +1502,7 @@ static void __init ecovec_mv_mem_reserve(void)
         if (!phys)
                 panic("Failed to allocate CEU0 memory\n");
 
-        memblock_free(phys, size);
+        memblock_phys_free(phys, size);
         memblock_remove(phys, size);
         ceu0_dma_membase = phys;
 

@@ -1510,7 +1510,7 @@ static void __init ecovec_mv_mem_reserve(void)
         if (!phys)
                 panic("Failed to allocate CEU1 memory\n");
 
-        memblock_free(phys, size);
+        memblock_phys_free(phys, size);
         memblock_remove(phys, size);
         ceu1_dma_membase = phys;
 }

@@ -633,7 +633,7 @@ static void __init kfr2r09_mv_mem_reserve(void)
         if (!phys)
                 panic("Failed to allocate CEU memory\n");
 
-        memblock_free(phys, size);
+        memblock_phys_free(phys, size);
         memblock_remove(phys, size);
 
         ceu_dma_membase = phys;

@@ -633,7 +633,7 @@ static void __init migor_mv_mem_reserve(void)
         if (!phys)
                 panic("Failed to allocate CEU memory\n");
 
-        memblock_free(phys, size);
+        memblock_phys_free(phys, size);
         memblock_remove(phys, size);
 
         ceu_dma_membase = phys;

@@ -966,7 +966,7 @@ static void __init ms7724se_mv_mem_reserve(void)
         if (!phys)
                 panic("Failed to allocate CEU0 memory\n");
 
-        memblock_free(phys, size);
+        memblock_phys_free(phys, size);
         memblock_remove(phys, size);
         ceu0_dma_membase = phys;
 

@@ -974,7 +974,7 @@ static void __init ms7724se_mv_mem_reserve(void)
         if (!phys)
                 panic("Failed to allocate CEU1 memory\n");
 
-        memblock_free(phys, size);
+        memblock_phys_free(phys, size);
         memblock_remove(phys, size);
         ceu1_dma_membase = phys;
 }
@@ -1567,7 +1567,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size,
 
 static void __init pcpu_free_bootmem(void *ptr, size_t size)
 {
-        memblock_free(__pa(ptr), size);
+        memblock_free(ptr, size);
 }
 
 static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
@@ -47,7 +47,7 @@ void __init mem_init(void)
          */
         brk_end = (unsigned long) UML_ROUND_UP(sbrk(0));
         map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0);
-        memblock_free(__pa(brk_end), uml_reserved - brk_end);
+        memblock_free((void *)brk_end, uml_reserved - brk_end);
         uml_reserved = brk_end;
 
         /* this will put all low memory onto the freelists */
@@ -63,7 +63,7 @@ config X86
         select ARCH_CLOCKSOURCE_INIT
         select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
         select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
-        select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM)
+        select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64
         select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
         select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE)
         select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE

@@ -1627,7 +1627,7 @@ config ARCH_SELECT_MEMORY_MODEL
 
 config ARCH_MEMORY_PROBE
         bool "Enable sysfs memory/probe interface"
-        depends on X86_64 && MEMORY_HOTPLUG
+        depends on MEMORY_HOTPLUG
         help
           This option enables a sysfs memory/probe interface for testing.
           See Documentation/admin-guide/mm/memory-hotplug.rst for more information.

@@ -2423,7 +2423,7 @@ endmenu
 
 config ARCH_HAS_ADD_PAGES
         def_bool y
-        depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
+        depends on ARCH_ENABLE_MEMORY_HOTPLUG
 
 config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
         def_bool y
@@ -322,7 +322,7 @@ static void __init reserve_initrd(void)
 
         relocate_initrd();
 
-        memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
+        memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image);
 }
 
 #else

@@ -521,7 +521,7 @@ static void __init reserve_crashkernel(void)
         }
 
         if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
-                memblock_free(crash_base, crash_size);
+                memblock_phys_free(crash_base, crash_size);
                 return;
         }
 
@@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
 
 static void __init pcpu_fc_free(void *ptr, size_t size)
 {
-        memblock_free_ptr(ptr, size);
+        memblock_free(ptr, size);
 }
 
 static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
@@ -618,7 +618,7 @@ static void __init memory_map_top_down(unsigned long map_start,
          */
         addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
                                          map_end);
-        memblock_free(addr, PMD_SIZE);
+        memblock_phys_free(addr, PMD_SIZE);
         real_end = addr + PMD_SIZE;
 
         /* step_size need to be small so pgt_buf from BRK could cover it */
@@ -779,37 +779,6 @@ void __init mem_init(void)
         test_wp_bit();
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size,
-                    struct mhp_params *params)
-{
-        unsigned long start_pfn = start >> PAGE_SHIFT;
-        unsigned long nr_pages = size >> PAGE_SHIFT;
-        int ret;
-
-        /*
-         * The page tables were already mapped at boot so if the caller
-         * requests a different mapping type then we must change all the
-         * pages with __set_memory_prot().
-         */
-        if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) {
-                ret = __set_memory_prot(start, nr_pages, params->pgprot);
-                if (ret)
-                        return ret;
-        }
-
-        return __add_pages(nid, start_pfn, nr_pages, params);
-}
-
-void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
-{
-        unsigned long start_pfn = start >> PAGE_SHIFT;
-        unsigned long nr_pages = size >> PAGE_SHIFT;
-
-        __remove_pages(start_pfn, nr_pages, altmap);
-}
-#endif
-
 int kernel_set_to_readonly __read_mostly;
 
 static void mark_nxdata_nx(void)
@@ -49,7 +49,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
                         p = early_alloc(PMD_SIZE, nid, false);
                         if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
                                 return;
-                        memblock_free_ptr(p, PMD_SIZE);
+                        memblock_free(p, PMD_SIZE);
                 }
 
                 p = early_alloc(PAGE_SIZE, nid, true);

@@ -85,7 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
                         p = early_alloc(PUD_SIZE, nid, false);
                         if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
                                 return;
-                        memblock_free_ptr(p, PUD_SIZE);
+                        memblock_free(p, PUD_SIZE);
                 }
 
                 p = early_alloc(PAGE_SIZE, nid, true);
@@ -355,7 +355,7 @@ void __init numa_reset_distance(void)
 
         /* numa_distance could be 1LU marking allocation failure, test cnt */
         if (numa_distance_cnt)
-                memblock_free_ptr(numa_distance, size);
+                memblock_free(numa_distance, size);
         numa_distance_cnt = 0;
         numa_distance = NULL;   /* enable table creation */
 }

@@ -517,7 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
         }
 
         /* free the copied physical distance table */
-        memblock_free_ptr(phys_dist, phys_size);
+        memblock_free(phys_dist, phys_size);
         return;
 
 no_emu:
@@ -1025,7 +1025,7 @@ static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
         for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
                 make_lowmem_page_readwrite(vaddr);
 
-        memblock_free(paddr, size);
+        memblock_phys_free(paddr, size);
 }
 
 static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)

@@ -1151,7 +1151,7 @@ static void __init xen_pagetable_p2m_free(void)
                 xen_cleanhighmap(addr, addr + size);
                 size = PAGE_ALIGN(xen_start_info->nr_pages *
                                   sizeof(unsigned long));
-                memblock_free(__pa(addr), size);
+                memblock_free((void *)addr, size);
         } else {
                 xen_cleanmfnmap(addr);
         }

@@ -1956,7 +1956,7 @@ void __init xen_relocate_p2m(void)
                 pfn_end = p2m_pfn_end;
         }
 
-        memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
+        memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
         while (pfn < pfn_end) {
                 if (pfn == p2m_pfn) {
                         pfn = p2m_pfn_end;
@@ -197,7 +197,7 @@ static void * __ref alloc_p2m_page(void)
 static void __ref free_p2m_page(void *p)
 {
         if (unlikely(!slab_is_available())) {
-                memblock_free((unsigned long)p, PAGE_SIZE);
+                memblock_free(p, PAGE_SIZE);
                 return;
         }
 
@@ -153,7 +153,7 @@ static void __init xen_del_extra_mem(unsigned long start_pfn,
                         break;
                 }
         }
-        memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
+        memblock_phys_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
 }
 
 /*

@@ -719,7 +719,7 @@ static void __init xen_reserve_xen_mfnlist(void)
                 return;
 
         xen_relocate_p2m();
-        memblock_free(start, size);
+        memblock_phys_free(start, size);
 }
 
 /**

@@ -885,7 +885,7 @@ char * __init xen_memory_setup(void)
                 xen_phys_memcpy(new_area, start, size);
                 pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
                         start, start + size, new_area, new_area + size);
-                memblock_free(start, size);
+                memblock_phys_free(start, size);
                 boot_params.hdr.ramdisk_image = new_area;
                 boot_params.ext_ramdisk_image = new_area >> 32;
         }
@@ -13,7 +13,7 @@ obj-y += power/
 obj-$(CONFIG_ISA_BUS_API)       += isa.o
 obj-y                           += firmware_loader/
 obj-$(CONFIG_NUMA)      += node.o
-obj-$(CONFIG_MEMORY_HOTPLUG_SPARSE) += memory.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
 ifeq ($(CONFIG_SYSFS),y)
 obj-$(CONFIG_MODULES)   += module.o
 endif
@ -14,6 +14,7 @@
|
||||||
#include <linux/of.h>
|
#include <linux/of.h>
|
||||||
|
|
||||||
#include <asm/sections.h>
|
#include <asm/sections.h>
|
||||||
|
#include <asm/pgalloc.h>
|
||||||
|
|
||||||
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
|
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
|
||||||
EXPORT_SYMBOL(node_data);
|
EXPORT_SYMBOL(node_data);
|
||||||
|
@ -165,25 +166,86 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size,
|
||||||
|
|
||||||
static void __init pcpu_fc_free(void *ptr, size_t size)
|
static void __init pcpu_fc_free(void *ptr, size_t size)
|
||||||
{
|
{
|
||||||
memblock_free_early(__pa(ptr), size);
|
memblock_free(ptr, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
|
||||||
|
static void __init pcpu_populate_pte(unsigned long addr)
|
||||||
|
{
|
||||||
|
pgd_t *pgd = pgd_offset_k(addr);
|
||||||
|
p4d_t *p4d;
|
||||||
|
pud_t *pud;
|
||||||
|
pmd_t *pmd;
|
||||||
|
|
||||||
|
p4d = p4d_offset(pgd, addr);
|
||||||
|
if (p4d_none(*p4d)) {
|
||||||
|
pud_t *new;
|
||||||
|
|
||||||
|
new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||||
|
if (!new)
|
||||||
|
goto err_alloc;
|
||||||
|
p4d_populate(&init_mm, p4d, new);
|
||||||
|
}
|
||||||
|
|
||||||
|
pud = pud_offset(p4d, addr);
|
||||||
|
if (pud_none(*pud)) {
|
||||||
|
pmd_t *new;
|
||||||
|
|
||||||
|
new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||||
|
if (!new)
|
||||||
|
goto err_alloc;
|
||||||
|
pud_populate(&init_mm, pud, new);
|
||||||
|
}
|
||||||
|
|
||||||
|
pmd = pmd_offset(pud, addr);
|
||||||
|
if (!pmd_present(*pmd)) {
|
||||||
|
pte_t *new;
|
||||||
|
|
||||||
|
new = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
|
||||||
|
if (!new)
|
||||||
|
goto err_alloc;
|
||||||
|
pmd_populate_kernel(&init_mm, pmd, new);
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
|
||||||
|
err_alloc:
|
||||||
|
panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n",
|
||||||
|
__func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void __init setup_per_cpu_areas(void)
|
void __init setup_per_cpu_areas(void)
|
||||||
{
|
{
|
||||||
unsigned long delta;
|
unsigned long delta;
|
||||||
unsigned int cpu;
|
unsigned int cpu;
|
||||||
int rc;
|
int rc = -EINVAL;
|
||||||
|
|
||||||
/*
|
if (pcpu_chosen_fc != PCPU_FC_PAGE) {
|
||||||
* Always reserve area for module percpu variables. That's
|
/*
|
||||||
* what the legacy allocator did.
|
* Always reserve area for module percpu variables. That's
|
||||||
*/
|
* what the legacy allocator did.
|
||||||
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
|
*/
|
||||||
PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
|
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
|
||||||
pcpu_cpu_distance,
|
PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
|
||||||
pcpu_fc_alloc, pcpu_fc_free);
|
pcpu_cpu_distance,
|
||||||
|
pcpu_fc_alloc, pcpu_fc_free);
|
||||||
|
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
|
||||||
|
if (rc < 0)
|
||||||
|
pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n",
|
||||||
|
pcpu_fc_names[pcpu_chosen_fc], rc);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
|
||||||
if (rc < 0)
|
if (rc < 0)
|
||||||
panic("Failed to initialize percpu areas.");
|
rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE,
|
||||||
|
pcpu_fc_alloc,
|
||||||
|
pcpu_fc_free,
|
||||||
|
pcpu_populate_pte);
|
||||||
|
#endif
|
||||||
|
if (rc < 0)
|
||||||
|
panic("Failed to initialize percpu areas (err=%d).", rc);
|
||||||
|
|
||||||
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
|
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
|
||||||
for_each_possible_cpu(cpu)
|
for_each_possible_cpu(cpu)
|
||||||
|
@ -264,7 +326,7 @@ void __init numa_free_distance(void)
|
||||||
size = numa_distance_cnt * numa_distance_cnt *
|
size = numa_distance_cnt * numa_distance_cnt *
|
||||||
sizeof(numa_distance[0]);
|
sizeof(numa_distance[0]);
|
||||||
|
|
||||||
memblock_free_ptr(numa_distance, size);
|
memblock_free(numa_distance, size);
|
||||||
numa_distance_cnt = 0;
|
numa_distance_cnt = 0;
|
||||||
numa_distance = NULL;
|
numa_distance = NULL;
|
||||||
}
|
}
|
||||||
|
@@ -275,15 +337,13 @@ void __init numa_free_distance(void)
 static int __init numa_alloc_distance(void)
 {
 	size_t size;
-	u64 phys;
 	int i, j;
 
 	size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]);
-	phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn));
-	if (WARN_ON(!phys))
+	numa_distance = memblock_alloc(size, PAGE_SIZE);
+	if (WARN_ON(!numa_distance))
 		return -ENOMEM;
 
-	numa_distance = __va(phys);
 	numa_distance_cnt = nr_node_ids;
 
 	/* fill with the default distances */
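Most of the memblock hunks in the remainder of this merge are one mechanical rename: memblock_free_ptr() becomes memblock_free(), and the old physical-address memblock_free() becomes memblock_phys_free(), so each free routine now matches the allocator that produced the address. A kernel-style sketch of the resulting pairing; memblock_naming_demo() is a hypothetical helper written only to illustrate the naming, not part of the series, and is not meant to build standalone:

    #include <linux/memblock.h>

    /* Hypothetical __init helper; shows only the post-rename API pairing. */
    static void __init memblock_naming_demo(void)
    {
        void *virt;        /* virtual address, from memblock_alloc() */
        phys_addr_t phys;  /* physical address, from memblock_phys_alloc() */

        virt = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
        phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);

        if (virt)
            memblock_free(virt, PAGE_SIZE);       /* frees by pointer */
        if (phys)
            memblock_phys_free(phys, PAGE_SIZE);  /* frees by physical address */
    }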
@@ -629,7 +629,7 @@ static void node_device_release(struct device *dev)
 {
 	struct node *node = to_node(dev);
 
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS)
 	/*
 	 * We schedule the work only when a memory section is
 	 * onlined/offlined on this node. When we come here,
@@ -782,7 +782,7 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
 	return 0;
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
 static int __ref get_nid_for_pfn(unsigned long pfn)
 {
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -958,10 +958,9 @@ static int node_memory_callback(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 #endif /* CONFIG_HUGETLBFS */
-#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
-#if !defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || \
-		!defined(CONFIG_HUGETLBFS)
+#if !defined(CONFIG_MEMORY_HOTPLUG) || !defined(CONFIG_HUGETLBFS)
 static inline int node_memory_callback(struct notifier_block *self,
 				       unsigned long action, void *arg)
 {
@@ -291,22 +291,16 @@ static ssize_t mem_used_max_store(struct device *dev,
 	return len;
 }
 
-static ssize_t idle_store(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t len)
+/*
+ * Mark all pages which are older than or equal to cutoff as IDLE.
+ * Callers should hold the zram init lock in read mode
+ */
+static void mark_idle(struct zram *zram, ktime_t cutoff)
 {
-	struct zram *zram = dev_to_zram(dev);
+	int is_idle = 1;
 	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
 	int index;
 
-	if (!sysfs_streq(buf, "all"))
-		return -EINVAL;
-
-	down_read(&zram->init_lock);
-	if (!init_done(zram)) {
-		up_read(&zram->init_lock);
-		return -EINVAL;
-	}
-
 	for (index = 0; index < nr_pages; index++) {
 		/*
 		 * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race.
@@ -314,14 +308,50 @@ static ssize_t idle_store(struct device *dev,
 		 */
 		zram_slot_lock(zram, index);
 		if (zram_allocated(zram, index) &&
-				!zram_test_flag(zram, index, ZRAM_UNDER_WB))
-			zram_set_flag(zram, index, ZRAM_IDLE);
+				!zram_test_flag(zram, index, ZRAM_UNDER_WB)) {
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+			is_idle = !cutoff || ktime_after(cutoff, zram->table[index].ac_time);
+#endif
+			if (is_idle)
+				zram_set_flag(zram, index, ZRAM_IDLE);
+		}
 		zram_slot_unlock(zram, index);
 	}
+}
+
+static ssize_t idle_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	ktime_t cutoff_time = 0;
+	ssize_t rv = -EINVAL;
+
+	if (!sysfs_streq(buf, "all")) {
+		/*
+		 * If it did not parse as 'all' try to treat it as an integer when
+		 * we have memory tracking enabled.
+		 */
+		u64 age_sec;
+
+		if (IS_ENABLED(CONFIG_ZRAM_MEMORY_TRACKING) && !kstrtoull(buf, 0, &age_sec))
+			cutoff_time = ktime_sub(ktime_get_boottime(),
+					ns_to_ktime(age_sec * NSEC_PER_SEC));
+		else
+			goto out;
+	}
+
+	down_read(&zram->init_lock);
+	if (!init_done(zram))
+		goto out_unlock;
+
+	/* A cutoff_time of 0 marks everything as idle, this is the "all" behavior */
+	mark_idle(zram, cutoff_time);
+	rv = len;
+
+out_unlock:
 	up_read(&zram->init_lock);
-	return len;
+out:
+	return rv;
 }
 
 #ifdef CONFIG_ZRAM_WRITEBACK
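With this refactor, idle_store() accepts either the literal string "all" or, when CONFIG_ZRAM_MEMORY_TRACKING is enabled, an age in seconds that is converted to a boottime cutoff. A small userspace sketch of driving the knob; the device name zram0 and the one-day age are illustrative assumptions:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Mark zram pages idle; pass "all" or an age in seconds as a string. */
    static int zram_mark_idle(const char *dev, const char *arg)
    {
        char path[64];
        int fd, rc;

        snprintf(path, sizeof(path), "/sys/block/%s/idle", dev);
        fd = open(path, O_WRONLY);
        if (fd < 0)
            return -1;
        rc = write(fd, arg, strlen(arg)) < 0 ? -1 : 0;
        close(fd);
        return rc;
    }

    int main(void)
    {
        /* "zram0" and the 86400-second (one day) age are assumptions. */
        if (zram_mark_idle("zram0", "86400"))
            perror("idle_store");
        return 0;
    }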
@@ -587,7 +617,7 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
 {
 	struct bio *bio;
 
-	bio = bio_alloc(GFP_ATOMIC, 1);
+	bio = bio_alloc(GFP_NOIO, 1);
 	if (!bio)
 		return -ENOMEM;
 
@@ -910,7 +940,7 @@ static ssize_t read_block_state(struct file *file, char __user *buf,
 			zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
 			zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.');
 
-		if (count < copied) {
+		if (count <= copied) {
 			zram_slot_unlock(zram, index);
 			break;
 		}
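The count <= copied fix in read_block_state() closes an off-by-one: snprintf() returns the length the record would have had, so a return value equal to the remaining space already means the record was truncated. The same accounting pattern in a standalone program (the record format here is made up):

    #include <stdio.h>

    int main(void)
    {
        char buf[32];
        size_t written = 0, count = sizeof(buf);
        int i;

        for (i = 0; i < 10; i++) {
            /* snprintf returns the would-be length, even when it truncates */
            int copied = snprintf(buf + written, count, "rec%02d\n", i);

            if (count <= (size_t)copied)  /* '<' would wrongly accept a truncated record */
                break;
            written += copied;
            count -= copied;
        }
        buf[written] = '\0';  /* keep only complete records, as read_block_state does */
        fputs(buf, stdout);
        return 0;
    }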
@@ -35,7 +35,7 @@ void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags)
 		if (slab_is_available())
 			memblock_free_late(phys, size);
 		else
-			memblock_free(phys, size);
+			memblock_phys_free(phys, size);
 	} else if (flags & EFI_MEMMAP_SLAB) {
 		struct page *p = pfn_to_page(PHYS_PFN(phys));
 		unsigned int order = get_order(size);
@@ -3,6 +3,7 @@
 
 #include <linux/device.h>
 #include <linux/errno.h>
+#include <linux/slab.h>
 #include <linux/fsi-occ.h>
 #include <linux/mm.h>
 #include <linux/module.h>
@@ -570,7 +570,7 @@ fail_msg_node:
 fail_db_node:
 	of_node_put(smu->db_node);
 fail_bootmem:
-	memblock_free_ptr(smu, sizeof(struct smu_device));
+	memblock_free(smu, sizeof(struct smu_device));
 	smu = NULL;
 fail_np:
 	of_node_put(np);
@@ -10,7 +10,6 @@
 #include <linux/slab.h>
 
 #include <linux/scatterlist.h>
-#include <linux/swap.h>		/* For nr_free_buffer_pages() */
 #include <linux/list.h>
 
 #include <linux/debugfs.h>
@@ -2409,6 +2409,7 @@ static void __exit cleanup_mtd(void)
 	if (proc_mtd)
 		remove_proc_entry("mtd", NULL);
 	class_unregister(&mtd_class);
+	bdi_unregister(mtd_bdi);
 	bdi_put(mtd_bdi);
 	idr_destroy(&mtd_idr);
 }
@@ -16,6 +16,7 @@
 #include <linux/of.h>
 #include <linux/of_fdt.h>
 #include <linux/random.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 
 #define RNG_SEED_SIZE		128
@@ -170,8 +171,7 @@ int ima_free_kexec_buffer(void)
 	if (ret)
 		return ret;
 
-	return memblock_free(addr, size);
-
+	return memblock_phys_free(addr, size);
 }
 
 /**
@@ -46,7 +46,7 @@ static int __init early_init_dt_alloc_reserved_memory_arch(phys_addr_t size,
 	if (nomap) {
 		err = memblock_mark_nomap(base, size);
 		if (err)
-			memblock_free(base, size);
+			memblock_phys_free(base, size);
 		kmemleak_ignore_phys(base);
 	}
 
@@ -284,7 +284,8 @@ void __init fdt_init_reserved_mem(void)
 			if (nomap)
 				memblock_clear_nomap(rmem->base, rmem->size);
 			else
-				memblock_free(rmem->base, rmem->size);
+				memblock_phys_free(rmem->base,
+						   rmem->size);
 		}
 	}
 }
@@ -965,6 +965,7 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
 	struct rio_transfer_io *transfer;
 	enum dma_data_direction dir;
 	int i, ret = 0;
+	size_t size;
 
 	if (unlikely(copy_from_user(&transaction, arg, sizeof(transaction))))
 		return -EFAULT;
@@ -976,13 +977,14 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
 			priv->md->properties.transfer_mode) == 0)
 		return -ENODEV;
 
-	transfer = vmalloc(array_size(sizeof(*transfer), transaction.count));
+	size = array_size(sizeof(*transfer), transaction.count);
+	transfer = vmalloc(size);
 	if (!transfer)
 		return -ENOMEM;
 
 	if (unlikely(copy_from_user(transfer,
 				    (void __user *)(uintptr_t)transaction.block,
-				    array_size(sizeof(*transfer), transaction.count)))) {
+				    size))) {
 		ret = -EFAULT;
 		goto out_free;
 	}
@@ -994,8 +996,7 @@ static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
 					 transaction.sync, dir, &transfer[i]);
 
 	if (unlikely(copy_to_user((void __user *)(uintptr_t)transaction.block,
-				  transfer,
-				  array_size(sizeof(*transfer), transaction.count))))
+				  transfer, size)))
 		ret = -EFAULT;
 
 out_free:
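Caching array_size() in size keeps the vmalloc(), the copy_from_user() and the copy_to_user() bounded by one value, and array_size() saturates rather than wraps on multiplication overflow. A standalone analog built on the GCC/Clang overflow intrinsic; array_size_demo() is an illustration, not the kernel's implementation:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Saturating multiply, analogous in spirit to the kernel's array_size(). */
    static size_t array_size_demo(size_t n, size_t elem)
    {
        size_t bytes;

        if (__builtin_mul_overflow(n, elem, &bytes))
            return SIZE_MAX;  /* saturate so a later allocation fails cleanly */
        return bytes;
    }

    int main(void)
    {
        printf("%zu\n", array_size_demo(100, 16));            /* 1600 */
        printf("%zu\n", array_size_demo(SIZE_MAX / 2, 16));   /* saturates to SIZE_MAX */
        return 0;
    }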
@@ -139,7 +139,7 @@ int __init sclp_early_get_core_info(struct sclp_core_info *info)
 	}
 	sclp_fill_core_info(info, sccb);
out:
-	memblock_free_early((unsigned long)sccb, length);
+	memblock_phys_free((unsigned long)sccb, length);
 	return rc;
 }
@@ -185,7 +185,7 @@ static void __init xdbc_free_ring(struct xdbc_ring *ring)
 	if (!seg)
 		return;
 
-	memblock_free(seg->dma, PAGE_SIZE);
+	memblock_phys_free(seg->dma, PAGE_SIZE);
 	ring->segment = NULL;
 }
 
@@ -665,10 +665,10 @@ int __init early_xdbc_setup_hardware(void)
 		xdbc_free_ring(&xdbc.in_ring);
 
 		if (xdbc.table_dma)
-			memblock_free(xdbc.table_dma, PAGE_SIZE);
+			memblock_phys_free(xdbc.table_dma, PAGE_SIZE);
 
 		if (xdbc.out_dma)
-			memblock_free(xdbc.out_dma, PAGE_SIZE);
+			memblock_phys_free(xdbc.out_dma, PAGE_SIZE);
 
 		xdbc.table_base = NULL;
 		xdbc.out_buf = NULL;
@@ -987,8 +987,8 @@ free_and_quit:
 	xdbc_free_ring(&xdbc.evt_ring);
 	xdbc_free_ring(&xdbc.out_ring);
 	xdbc_free_ring(&xdbc.in_ring);
-	memblock_free(xdbc.table_dma, PAGE_SIZE);
-	memblock_free(xdbc.out_dma, PAGE_SIZE);
+	memblock_phys_free(xdbc.table_dma, PAGE_SIZE);
+	memblock_phys_free(xdbc.out_dma, PAGE_SIZE);
 	writel(0, &xdbc.xdbc_reg->control);
 	early_iounmap(xdbc.xhci_base, xdbc.xhci_length);
@@ -108,7 +108,7 @@ config VIRTIO_MEM
 	default m
 	depends on X86_64
 	depends on VIRTIO
-	depends on MEMORY_HOTPLUG_SPARSE
+	depends on MEMORY_HOTPLUG
 	depends on MEMORY_HOTREMOVE
 	depends on CONTIG_ALLOC
 	help
@@ -241,7 +241,7 @@ retry:
 	 */
 	rc = xen_swiotlb_fixup(start, nslabs);
 	if (rc) {
-		memblock_free(__pa(start), PAGE_ALIGN(bytes));
+		memblock_free(start, PAGE_ALIGN(bytes));
 		if (nslabs > 1024 && repeat--) {
 			/* Min is 2MB */
 			nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE));
@@ -77,9 +77,8 @@ static bool prepend(struct prepend_buffer *p, const char *str, int namelen)
 
 /**
  * prepend_name - prepend a pathname in front of current buffer pointer
- * @buffer: buffer pointer
- * @buflen: allocated length of the buffer
+ * @p: prepend buffer which contains buffer pointer and allocated length
  * @name:  name string and length qstr structure
  *
  * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
  * make sure that either the old or the new name pointer and length are
@@ -141,8 +140,7 @@ static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
  * prepend_path - Prepend path string to a buffer
  * @path: the dentry/vfsmount to report
  * @root: root vfsmnt/dentry
- * @buffer: pointer to the end of the buffer
- * @buflen: pointer to buffer length
+ * @p: prepend buffer which contains buffer pointer and allocated length
  *
  * The function will first try to write out the pathname without taking any
 * lock other than the RCU read lock to make sure that dentries won't go away.
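The @p parameter these kernel-doc fixes describe is a prepend buffer: the write pointer starts at the end of the allocation and every path component is copied in front of the previous one, leaf to root. A self-contained sketch of the idea; struct pbuf and prepend_demo() are invented names, not the fs/d_path.c API:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct pbuf {
        char *ptr;  /* current write position, moves backwards */
        int len;    /* space remaining in front of ptr */
    };

    /* Copy str (namelen bytes) immediately in front of the current position. */
    static bool prepend_demo(struct pbuf *p, const char *str, int namelen)
    {
        p->len -= namelen;
        if (p->len < 0)
            return false;
        p->ptr -= namelen;
        memcpy(p->ptr, str, namelen);
        return true;
    }

    int main(void)
    {
        char buf[64];
        struct pbuf p = { buf + sizeof(buf) - 1, sizeof(buf) - 1 };

        buf[sizeof(buf) - 1] = '\0';
        /* Components are visited leaf-to-root, so "c", then "b", then "a". */
        prepend_demo(&p, "/c", 2);
        prepend_demo(&p, "/b", 2);
        prepend_demo(&p, "/a", 2);
        puts(p.ptr);  /* prints "/a/b/c" */
        return 0;
    }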
@@ -5940,6 +5940,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 	status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
+		ocfs2_commit_trans(osb, handle);
 		mlog_errno(status);
 		goto bail;
 	}
@@ -5964,6 +5965,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 						data_alloc_bh, start_blk,
 						num_clusters);
 		if (status < 0) {
+			ocfs2_commit_trans(osb, handle);
 			mlog_errno(status);
 			goto bail;
 		}
@@ -6921,13 +6923,12 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
 }
 
 /*
- * Zero the area past i_size but still within an allocated
- * cluster. This avoids exposing nonzero data on subsequent file
- * extends.
+ * Zero partial cluster for a hole punch or truncate. This avoids exposing
+ * nonzero data on subsequent file extends.
  *
  * We need to call this before i_size is updated on the inode because
 * otherwise block_write_full_page() will skip writeout of pages past
- * i_size. The new_i_size parameter is passed for this reason.
+ * i_size.
 */
 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
 				  u64 range_start, u64 range_end)
@@ -6945,6 +6946,15 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
 	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
 		return 0;
 
+	/*
+	 * Avoid zeroing pages fully beyond current i_size. It is pointless as
+	 * underlying blocks of those pages should be already zeroed out and
+	 * page writeback will skip them anyway.
+	 */
+	range_end = min_t(u64, range_end, i_size_read(inode));
+	if (range_start >= range_end)
+		return 0;
+
 	pages = kcalloc(ocfs2_pages_per_cluster(sb),
 			sizeof(struct page *), GFP_NOFS);
 	if (pages == NULL) {
@@ -6953,9 +6963,6 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
 		goto out;
 	}
 
-	if (range_start == range_end)
-		goto out;
-
 	ret = ocfs2_extent_map_get_blocks(inode,
 					  range_start >> sb->s_blocksize_bits,
 					  &phys, NULL, &ext_flags);
@@ -2698,7 +2698,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
 			continue;
 		}
 retry:
-		ret = -EINVAL;
 		mlog(0, "attempting to send begin reco msg to %d\n",
 		     nodenum);
 		ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
@@ -476,10 +476,11 @@ int ocfs2_truncate_file(struct inode *inode,
 	 * greater than page size, so we have to truncate them
 	 * anyway.
 	 */
-	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
-	truncate_inode_pages(inode->i_mapping, new_i_size);
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		unmap_mapping_range(inode->i_mapping,
+				    new_i_size + PAGE_SIZE - 1, 0, 1);
+		truncate_inode_pages(inode->i_mapping, new_i_size);
 		status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
 					       i_size_read(inode), 1);
 		if (status)
@@ -498,6 +499,9 @@ int ocfs2_truncate_file(struct inode *inode,
 		goto bail_unlock_sem;
 	}
 
+	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+	truncate_inode_pages(inode->i_mapping, new_i_size);
+
 	status = ocfs2_commit_truncate(osb, inode, di_bh);
 	if (status < 0) {
 		mlog_errno(status);
@@ -125,7 +125,6 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 	struct inode *inode = NULL;
 	struct super_block *sb = osb->sb;
 	struct ocfs2_find_inode_args args;
-	journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
 
 	trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
 			       sysfile_type);
@@ -172,10 +171,11 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 	 * part of the transaction - the inode could have been reclaimed and
 	 * now it is reread from disk.
 	 */
-	if (journal) {
+	if (osb->journal) {
 		transaction_t *transaction;
 		tid_t tid;
 		struct ocfs2_inode_info *oi = OCFS2_I(inode);
+		journal_t *journal = osb->journal->j_journal;
 
 		read_lock(&journal->j_state_lock);
 		if (journal->j_running_transaction)
@@ -810,19 +810,34 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
 	write_unlock(&journal->j_state_lock);
 }
 
-int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
+int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
 {
 	int status = -1;
 	struct inode *inode = NULL; /* the journal inode */
 	journal_t *j_journal = NULL;
+	struct ocfs2_journal *journal = NULL;
 	struct ocfs2_dinode *di = NULL;
 	struct buffer_head *bh = NULL;
-	struct ocfs2_super *osb;
 	int inode_lock = 0;
 
-	BUG_ON(!journal);
+	/* initialize our journal structure */
+	journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
+	if (!journal) {
+		mlog(ML_ERROR, "unable to alloc journal\n");
+		status = -ENOMEM;
+		goto done;
+	}
+	osb->journal = journal;
+	journal->j_osb = osb;
 
-	osb = journal->j_osb;
+	atomic_set(&journal->j_num_trans, 0);
+	init_rwsem(&journal->j_trans_barrier);
+	init_waitqueue_head(&journal->j_checkpointed);
+	spin_lock_init(&journal->j_lock);
+	journal->j_trans_id = 1UL;
+	INIT_LIST_HEAD(&journal->j_la_cleanups);
+	INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
+	journal->j_state = OCFS2_JOURNAL_FREE;
 
 	/* already have the inode for our journal */
 	inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
@@ -1028,9 +1043,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 
 	journal->j_state = OCFS2_JOURNAL_FREE;
 
-//	up_write(&journal->j_trans_barrier);
done:
 	iput(inode);
+	kfree(journal);
+	osb->journal = NULL;
 }
 
 static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -167,8 +167,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
  *  ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
  */
 void ocfs2_set_journal_params(struct ocfs2_super *osb);
-int ocfs2_journal_init(struct ocfs2_journal *journal,
-		       int *dirty);
+int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty);
 void ocfs2_journal_shutdown(struct ocfs2_super *osb);
 int ocfs2_journal_wipe(struct ocfs2_journal *journal,
 		       int full);
@@ -1894,8 +1894,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 	/* This will disable recovery and flush any recovery work. */
 	ocfs2_recovery_exit(osb);
 
-	ocfs2_journal_shutdown(osb);
-
 	ocfs2_sync_blockdev(sb);
 
 	ocfs2_purge_refcount_trees(osb);
@@ -1918,6 +1916,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	ocfs2_release_system_inodes(osb);
 
+	ocfs2_journal_shutdown(osb);
+
 	/*
 	 * If we're dismounting due to mount error, mount.ocfs2 will clean
 	 * up heartbeat. If we're a local mount, there is no heartbeat.
@@ -2016,7 +2016,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	int i, cbits, bbits;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
 	struct inode *inode = NULL;
-	struct ocfs2_journal *journal;
 	struct ocfs2_super *osb;
 	u64 total_blocks;
 
@@ -2197,33 +2196,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
 	get_random_bytes(&osb->s_next_generation, sizeof(u32));
 
-	/* FIXME
-	 * This should be done in ocfs2_journal_init(), but unknown
-	 * ordering issues will cause the filesystem to crash.
-	 * If anyone wants to figure out what part of the code
-	 * refers to osb->journal before ocfs2_journal_init() is run,
-	 * be my guest.
-	 */
-	/* initialize our journal structure */
-
-	journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL);
-	if (!journal) {
-		mlog(ML_ERROR, "unable to alloc journal\n");
-		status = -ENOMEM;
-		goto bail;
-	}
-	osb->journal = journal;
-	journal->j_osb = osb;
-
-	atomic_set(&journal->j_num_trans, 0);
-	init_rwsem(&journal->j_trans_barrier);
-	init_waitqueue_head(&journal->j_checkpointed);
-	spin_lock_init(&journal->j_lock);
-	journal->j_trans_id = (unsigned long) 1;
-	INIT_LIST_HEAD(&journal->j_la_cleanups);
-	INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
-	journal->j_state = OCFS2_JOURNAL_FREE;
-
 	INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
 	init_llist_head(&osb->dquot_drop_list);
 
@@ -2404,7 +2376,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 	 * ourselves. */
 
 	/* Init our journal object. */
-	status = ocfs2_journal_init(osb->journal, &dirty);
+	status = ocfs2_journal_init(osb, &dirty);
 	if (status < 0) {
 		mlog(ML_ERROR, "Could not initialize journal!\n");
 		goto finally;
@@ -2513,12 +2485,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 
 	kfree(osb->osb_orphan_wipes);
 	kfree(osb->slot_recovery_generations);
-	/* FIXME
-	 * This belongs in journal shutdown, but because we have to
-	 * allocate osb->journal at the start of ocfs2_initialize_osb(),
-	 * we free it here.
-	 */
-	kfree(osb->journal);
 	kfree(osb->local_alloc_copy);
 	kfree(osb->uuid_str);
 	kfree(osb->vol_label);
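Taken together, the journal.c, journal.h and super.c hunks move both ends of the journal's lifetime into the journal code: ocfs2_journal_init() now allocates osb->journal and ocfs2_journal_shutdown() frees it, retiring the two FIXMEs about split ownership. A minimal standalone sketch of that init-owns-alloc / shutdown-owns-free pattern; all names here are invented for illustration:

    #include <stdio.h>
    #include <stdlib.h>

    struct journal { int state; };
    struct super { struct journal *journal; };

    /* init allocates and wires up the subobject it owns... */
    static int journal_init(struct super *sb)
    {
        sb->journal = calloc(1, sizeof(*sb->journal));
        if (!sb->journal)
            return -1;
        sb->journal->state = 1;
        return 0;
    }

    /* ...and shutdown, not some unrelated teardown path, frees it. */
    static void journal_shutdown(struct super *sb)
    {
        free(sb->journal);
        sb->journal = NULL;
    }

    int main(void)
    {
        struct super sb = { 0 };

        if (journal_init(&sb))
            return 1;
        journal_shutdown(&sb);
        puts("journal lifetime handled by one owner");
        return 0;
    }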
fs/open.c
@@ -856,8 +856,20 @@ static int do_dentry_open(struct file *f,
 		 * of THPs into the page cache will fail.
 		 */
 		smp_mb();
-		if (filemap_nr_thps(inode->i_mapping))
-			truncate_pagecache(inode, 0);
+		if (filemap_nr_thps(inode->i_mapping)) {
+			struct address_space *mapping = inode->i_mapping;
+
+			filemap_invalidate_lock(inode->i_mapping);
+			/*
+			 * unmap_mapping_range just need to be called once
+			 * here, because the private pages is not need to be
+			 * unmapped mapping (e.g. data segment of dynamic
+			 * shared libraries here).
+			 */
+			unmap_mapping_range(mapping, 0, 0, 0);
+			truncate_inode_pages(mapping, 0);
+			filemap_invalidate_unlock(inode->i_mapping);
+		}
 	}
 
 	return 0;
Some files were not shown because too many files have changed in this diff.