File numactl- of Package numactl
07070100000000000041ED0000000000000000000000016319106A00000000000000000000000000000000000000000000002200000000numactl- For most projects, this workflow file will not need changing; you simply need # to commit it to your repository. # # You may wish to alter this file to override the set of languages analyzed, # or to provide custom queries or build logic. # # ******** NOTE ******** # We have attempted to detect the languages in your repository. Please check # the `language` matrix defined below to confirm you have the correct set of # supported CodeQL languages. # name: "CodeQL" on: push: branches: [ "master" ] pull_request: # The branches below must be a subset of the branches above branches: [ "master" ] schedule: - cron: '25 18 * * 6' jobs: analyze: name: Analyze runs-on: ubuntu-latest permissions: actions: read contents: read security-events: write strategy: fail-fast: false matrix: language: [ 'cpp' ] # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] # Learn more about CodeQL language support at steps: - name: Checkout repository uses: actions/checkout@v3 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@v2 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. # By default, queries listed here will override any specified in a config file. # Prefix the list here with "+" to use these queries and those in the config file. # Details on CodeQL's query packs refer to : # queries: security-extended,security-and-quality # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild uses: github/codeql-action/autobuild@v2 # ℹ️ Command-line programs to run using the OS shell. # 📚 See # If the Autobuild fails above, remove it and uncomment the following three lines. 
# modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. # - run: | # echo "Run, Build Application using script" # ./location_of_script_within_repo/ - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v2 07070100000003000081A40000000000000000000000016319106A00000273000000000000000000000000000000000000003900000000numactl- Makefile CI on: push: branches: [ "master", "action-1" ] pull_request: branches: [ "master", "action-1" ] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - name: configure run: ./ && ./configure - name: Install dependencies run: make - name: Werror run run: make clean ; CFLAGS="-O2 -Wall -Werror" make - name: clang run run: make clean ; CFLAGS="-O2 -Wall -Werror" CC=clang make - name: Run check run: make check - name: Run distcheck run: make distcheck 07070100000004000081A40000000000000000000000016319106A00000303000000000000000000000000000000000000002500000000numactl- Object files: *.o *.lo # Auxiliary build files: .deps .libs .dirstamp # libnuma libraries: / /libnuma.a / /* # Built binaries: /memhog /migratepages /migspeed /numactl /numademo /numamon /numastat /stream # Built test cases: /test/distance /test/ftok /test/mbind_mig_pages /test/migrate_pages /test/move_pages /test/mynode /test/node-parse /test/nodemap /test/pagesize /test/prefered /test/randmap /test/realloc_test /test/tbitmap /test/tshared # Files generated by autoreconf: / /aclocal.m4 /autom4te.cache /build-aux / /configure # Files generated by configure: /Makefile /config.h /config.log /config.status /libtool /stamp-* # Test logs: /test-suite.log /test/*.log /test/*.trs # pkg-config file numa.pc 07070100000005000081A40000000000000000000000016319106A0000011B000000000000000000000000000000000000002700000000numactl- numactl jobs: - job: copr_build metadata: targets: - fedora-30-x86_64 - fedora-31-x86_64 - fedora-rawhide-x86_64 trigger: pull_request specfile_path: numactl.spec synced_files: - 
numactl.spec - .packit.yaml upstream_package_name: numactl 07070100000006000081A40000000000000000000000016319106A000006E8000000000000000000000000000000000000002600000000numactl- bionic language: c env: global: - MAKEFLAGS="-j 2" # parallelize compilation process matrix: include: - name: "gcc 10" env: MATRIX_ENV="CC=gcc-10 CXX=g++-10" addons: apt: sources: - ubuntu-toolchain-r-test packages: - gcc-10 - name: "gcc 9" env: MATRIX_ENV="CC=gcc-9 CXX=g++-9" addons: apt: sources: - ubuntu-toolchain-r-test packages: - gcc-9 - name: "gcc 8" env: MATRIX_ENV="CC=gcc-8 CXX=g++-8" addons: apt: sources: - ubuntu-toolchain-r-test packages: - gcc-8 - name: "gcc 7" env: MATRIX_ENV="CC=gcc-7 CXX=g++-7" addons: apt: sources: - ubuntu-toolchain-r-test packages: - gcc-7 - name: "gcc 6" env: MATRIX_ENV="CC=gcc-6 CXX=g++-6" addons: apt: sources: - ubuntu-toolchain-r-test packages: - gcc-6 - name: "gcc 5" dist: xenial env: MATRIX_ENV="CC=gcc-5 CXX=g++-5" addons: apt: sources: - ubuntu-toolchain-r-test packages: - gcc-5 - name: "gcc 4.9" dist: xenial env: MATRIX_ENV="CC=gcc-4.9 CXX=g++-4.9" addons: apt: sources: - ubuntu-toolchain-r-test packages: - gcc-4.9 before_install: - eval "${MATRIX_ENV}" before_script: - ./ - ./configure CPPFLAGS=-Werror script: - make -k - ./numactl --show - make -k check VERBOSE=1 TESTS='test/distance test/nodemap test/numademo test/tbitmap' - make distcheck LOG_COMPILER='sh -c "exit 77"' 07070100000007000081A40000000000000000000000016319106A0000074D000000000000000000000000000000000000002500000000numactl- Building `numactl` TL;DR: ```shell $ ./ $ ./configure $ make # make install ``` Start by configuring the build running the configure script: ```shell $ ./configure ``` You can pass options to configure to define build options, to pass it compiler paths, compiler flags and to define the installation layout. Run `./configure --help` for more details on how to customize the build. 
Once the build is configured, build `numactl` with: ```shell $ make ``` If you would like to increase verbosity by printing the full build command lines, pass `make` the `V=1` parameter: ```shell $ make V=1 ``` You can build and run the tests included with numactl with the following command: ```shell $ make check ``` The results will be saved in `test/*.log` files and a `test-suite.log` will be generated with the summary of test passes and failures. Install numactl to the system by running the following command as root: ```shell # make install ``` You can also install it to a staging directory, in which case it is not required to be root while running the install steps. Just pass a DESTDIR variable while running `make install` with the path to the staging directory. ```shell $ make install DESTDIR=/path/to/staging/numactl ``` ## Using a snapshot from the Git repository First, the build system files need to be generated using the `./autogen.sh` script, which calls `autoreconf` with the appropriate options to generate the configure script and the templates for `Makefile`, `config.h`, etc. Once those files are generated, follow the normal steps to configure and build numactl. In order to create a distribution tarball, use `make dist` from a configured build tree. Use `make distcheck` to build a distribution tarball and confirm that rebuilding from that archive works as expected, that building out-of-tree works, and that the test cases pass. 07070100000008000081A40000000000000000000000016319106A00004643000000000000000000000000000000000000002700000000numactl- GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. 
By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. 
Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. 
You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. 
But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. 
For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. 
Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. 
Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. 
Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. 
<one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. <signature of Ty Coon>, 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. 
If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. It consists of a numactl program to run other programs with a specific NUMA policy and a libnuma shared library ("NUMA API") to set NUMA policy in applications. The libnuma binary interface is supposed to stay binary compatible. Incompatible changes will use new symbol version numbers. In addition there are various test and utility programs, like `numastat` to display NUMA allocation statistics and `memhog`. In `test/` there is a small regression test suite. Note that `regress` assumes an unloaded machine with memory free on each node. Otherwise you will get spurious failures in the non-strict policies (preferred, interleave). See the manpages `numactl.8` and `numa.3` for details. # License, Copyrights, Acknowledgements `numactl` and the demo programs are under the GNU General Public License, v.2. `libnuma` is under the GNU Lesser General Public License, v2.1. The manpages are under the same license as the Linux manpages (see the files). `numademo` links with a library derived from the C version of STREAM by John D. McCalpin and Joe R. Zagar for one sub benchmark. See `stream_lib.c` for the license. In particular when you publish `numademo` output you might need to pay attention there or filter out the STREAM results. It also uses a public domain Mersenne Twister implementation from Michael Brundage. Version 2.0.10-rc2: (C)2014 SGI Author: Andi Kleen, SUSE Labs Version 2.0.0 by Cliff Wickman, Christoph Lameter and Lee Schermerhorn. 0707010000000C000081A40000000000000000000000016319106A000020C4000000000000000000000000000000000000002500000000numactl-* Support for specifying IO affinity by various means. 
Copyright 2010 Intel Corporation Author: Andi Kleen libnuma is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; version 2.1. libnuma is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should find a copy of v2.1 of the GNU Lesser General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* Notebook: - Separate real errors from no NUMA with fallback - Infiniband - FCoE? - Support for other special IO devices - Specifying cpu subsets inside the IO node? - Handle multiple IO nodes (needs kernel changes) - Better support for multi-path IO? */ #define _GNU_SOURCE 1 #include <string.h> #include <errno.h> #include <sys/stat.h> #include <netdb.h> #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <sys/socket.h> #include <sys/ioctl.h> #include <net/if.h> #include <dirent.h> #include <linux/rtnetlink.h> #include <linux/netlink.h> #include <sys/types.h> #include <sys/sysmacros.h> #include <ctype.h> #include <assert.h> #include <regex.h> #include <sys/sysmacros.h> #include "numa.h" #include "numaint.h" #include "sysfs.h" #include "affinity.h" #include "rtnetlink.h" static int badchar(const char *s) { if (strpbrk(s, "/.")) return 1; return 0; } static int node_parse_failure(int ret, char *cls, const char *dev) { if (!cls) cls = ""; if (ret == -2) numa_warn(W_node_parse1, "Kernel does not know node mask for%s%s device `%s'", *cls ? 
" " : "", cls, dev); else numa_warn(W_node_parse2, "Cannot read node mask for %s device `%s'", cls, dev); return -1; } /* Generic sysfs class lookup */ static int affinity_class(struct bitmask *mask, char *cls, const char *dev) { int ret; while (isspace(*dev)) dev++; if (badchar(dev)) { numa_warn(W_badchar, "Illegal characters in `%s' specification", dev); return -1; } /* Somewhat hackish: extract device from symlink path. Better would be a direct backlink. This knows slightly too much about the actual sysfs layout. */ char path[1024]; char *fn = NULL; if (asprintf(&fn, "/sys/class/%s/%s", cls, dev) > 0 && readlink(fn, path, sizeof path) > 0) { regex_t re; regmatch_t match[2]; char *p; regcomp(&re, "(/devices/pci[0-9a-fA-F:/]+\\.[0-9]+)/", REG_EXTENDED); ret = regexec(&re, path, 2, match, 0); regfree(&re); if (ret == 0) { free(fn); assert(match[0].rm_so > 0); assert(match[0].rm_eo > 0); path[match[1].rm_eo + 1] = 0; p = path + match[0].rm_so; ret = sysfs_node_read(mask, "/sys/%s/numa_node", p); if (ret < 0) return node_parse_failure(ret, NULL, p); return ret; } } free(fn); ret = sysfs_node_read(mask, "/sys/class/%s/%s/device/numa_node", cls, dev); if (ret < 0) return node_parse_failure(ret, cls, dev); return 0; } /* Turn file (or device node) into class name */ static int affinity_file(struct bitmask *mask, char *cls, const char *file) { struct stat st; DIR *dir; int n; unsigned maj = 0, min = 0; dev_t d; struct dirent *dep; cls = "block"; char fn[sizeof("/sys/class/") + strlen(cls)]; if (stat(file, &st) < 0) { numa_warn(W_blockdev1, "Cannot stat file %s", file); return -1; } d = st.st_dev; if (S_ISCHR(st.st_mode)) { /* Better choice than misc? Most likely misc will not work anyways unless the kernel is fixed. 
*/ cls = "misc"; d = st.st_rdev; } else if (S_ISBLK(st.st_mode)) d = st.st_rdev; sprintf(fn, "/sys/class/%s", cls); dir = opendir(fn); if (!dir) { numa_warn(W_blockdev2, "Cannot enumerate %s devices in sysfs", cls); return -1; } while ((dep = readdir(dir)) != NULL) { char *name = dep->d_name; int ret; if (*name == '.') continue; char *dev; char fn2[sizeof("/sys/class/block//dev") + strlen(name)]; n = -1; if (sprintf(fn2, "/sys/class/block/%s/dev", name) < 0) break; dev = sysfs_read(fn2); if (dev) { n = sscanf(dev, "%u:%u", &maj, &min); free(dev); } if (n != 2) { numa_warn(W_blockdev3, "Cannot parse sysfs device %s", name); continue; } if (major(d) != maj || minor(d) != min) continue; ret = affinity_class(mask, "block", name); closedir(dir); return ret; } closedir(dir); numa_warn(W_blockdev5, "Cannot find block device %x:%x in sysfs for `%s'", maj, min, file); return -1; } /* Look up interface of route using rtnetlink. */ static int find_route(struct sockaddr *dst, int *iifp) { struct rtattr *rta; const int hdrlen = NLMSG_LENGTH(sizeof(struct rtmsg)); struct { struct nlmsghdr msg; struct rtmsg rt; char buf[256]; } req = { .msg = { .nlmsg_len = hdrlen, .nlmsg_type = RTM_GETROUTE, .nlmsg_flags = NLM_F_REQUEST, }, .rt = { .rtm_family = dst->sa_family, }, }; struct sockaddr_nl adr = { .nl_family = AF_NETLINK, }; if (rta_put_address(&req.msg, RTA_DST, dst) < 0) { numa_warn(W_netlink1, "Cannot handle network family %x", dst->sa_family); return -1; } if (rtnetlink_request(&req.msg, sizeof req, &adr) < 0) { numa_warn(W_netlink2, "Cannot request rtnetlink route: %s", strerror(errno)); return -1; } /* Fish the interface out of the netlink soup. 
*/ rta = NULL; while ((rta = rta_get(&req.msg, rta, hdrlen)) != NULL) { if (rta->rta_type == RTA_OIF) { memcpy(iifp, RTA_DATA(rta), sizeof(int)); return 0; } } numa_warn(W_netlink3, "rtnetlink query did not return interface"); return -1; } static int iif_to_name(int iif, struct ifreq *ifr) { int n; int sk = socket(PF_INET, SOCK_DGRAM, 0); if (sk < 0) return -1; ifr->ifr_ifindex = iif; n = ioctl(sk, SIOCGIFNAME, ifr); close(sk); return n; } /* Resolve an IP address to the nodes of a network device. This generally only attempts to handle simple cases: no multi-path, no bounding etc. In these cases only the first interface or none is chosen. */ static int affinity_ip(struct bitmask *mask, char *cls, const char *id) { struct addrinfo *ai; int n; int iif; struct ifreq ifr; if ((n = getaddrinfo(id, NULL, NULL, &ai)) != 0) { numa_warn(W_net1, "Cannot resolve %s: %s", id, gai_strerror(n)); return -1; } if (find_route(&ai->ai_addr[0], &iif) < 0) goto out_ai; if (iif_to_name(iif, &ifr) < 0) { numa_warn(W_net2, "Cannot resolve network interface %d", iif); goto out_ai; } freeaddrinfo(ai); return affinity_class(mask, "net", ifr.ifr_name); out_ai: freeaddrinfo(ai); return -1; } /* Look up affinity for a PCI device */ static int affinity_pci(struct bitmask *mask, char *cls, const char *id) { unsigned seg, bus, dev, func; int n, ret; /* Func is optional. 
*/ if ((n = sscanf(id, "%x:%x:%x.%x",&seg,&bus,&dev,&func)) == 4 || n == 3) { if (n == 3) func = 0; } /* Segment is optional too */ else if ((n = sscanf(id, "%x:%x.%x",&bus,&dev,&func)) == 3 || n == 2) { seg = 0; if (n == 2) func = 0; } else { numa_warn(W_pci1, "Cannot parse PCI device `%s'", id); return -1; } ret = sysfs_node_read(mask, "/sys/devices/pci%04x:%02x/%04x:%02x:%02x.%x/numa_node", seg, bus, seg, bus, dev, func); if (ret < 0) return node_parse_failure(ret, cls, id); return 0; } static struct handler { char first; char *name; char *cls; int (*handler)(struct bitmask *mask, char *cls, const char *desc); } handlers[] = { { 'n', "netdev:", "net", affinity_class }, { 'i', "ip:", NULL, affinity_ip }, { 'f', "file:", NULL, affinity_file }, { 'b', "block:", "block", affinity_class }, { 'p', "pci:", NULL, affinity_pci }, {} }; hidden int resolve_affinity(const char *id, struct bitmask *mask) { struct handler *h; for (h = &handlers[0]; h->first; h++) { int len; if (id[0] != h->first) continue; len = strlen(h->name); if (!strncmp(id, h->name, len)) { int ret = h->handler(mask, h->cls, id + len); if (ret == -2) { numa_warn(W_nonode, "Kernel does not know node for %s\n", id + len); } return ret; } } return NO_IO_AFFINITY; } 0707010000000D000081A40000000000000000000000016319106A0000005C000000000000000000000000000000000000002500000000numactl- { NO_IO_AFFINITY = -2 }; int resolve_affinity(const char *id, struct bitmask *mask); 0707010000000E000081ED0000000000000000000000016319106A00000032000000000000000000000000000000000000002500000000numactl-!/bin/sh set -e autoreconf --install --symlink 0707010000000F000081A40000000000000000000000016319106A0000079F000000000000000000000000000000000000002700000000numactl-* Clear the CPU cache for benchmark purposes. Pretty simple minded. * Might not work in some complex cache topologies. * When you switch CPUs it's a good idea to clear the cache after testing * too. 
*/ #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include "clearcache.h" static unsigned cache_size(void) { unsigned cs = 0; #ifdef _SC_LEVEL1_DCACHE_SIZE cs += sysconf(_SC_LEVEL1_DCACHE_SIZE); #endif #ifdef _SC_LEVEL2_CACHE_SIZE cs += sysconf(_SC_LEVEL2_CACHE_SIZE); #endif #ifdef _SC_LEVEL3_CACHE_SIZE cs += sysconf(_SC_LEVEL3_CACHE_SIZE); #endif #ifdef _SC_LEVEL4_CACHE_SIZE cs += sysconf(_SC_LEVEL4_CACHE_SIZE); #endif if (cs == 0) { static int warned; if (!warned) { printf("Cannot determine CPU cache size\n"); warned = 1; } cs = 64*1024*1024; } cs *= 2; /* safety factor */ return cs; } static void fallback_clearcache(void) { static unsigned char *clearmem; unsigned cs = cache_size(); unsigned i; if (!clearmem) clearmem = malloc(cs); if (!clearmem) { printf("Warning: cannot allocate %u bytes of clear cache buffer\n", cs); return; } for (i = 0; i < cs; i += 32) clearmem[i] = 1; } void clearcache(unsigned char *mem, unsigned size) { #if defined(__i386__) || defined(__x86_64__) unsigned i, cl, eax, feat; /* get clflush unit and feature */ asm("cpuid" : "=a" (eax), "=b" (cl), "=d" (feat) : "0" (1) : "cx"); if (!(feat & (1 << 19))) fallback_clearcache(); cl = ((cl >> 8) & 0xff) * 8; for (i = 0; i < size; i += cl) asm("clflush %0" :: "m" (mem[i])); #elif defined(__ia64__) unsigned long cl, endcl; // flush probable 128 byte cache lines (but possibly 64 bytes) cl = (unsigned long)mem; endcl = (unsigned long)(mem + (size-1)); for (; cl <= endcl; cl += 64) asm ("fc %0" :: "r"(cl) : "memory" ); #else #warning "Consider adding a clearcache implementation for your architecture" fallback_clearcache(); #endif } 07070100000010000081A40000000000000000000000016319106A00000034000000000000000000000000000000000000002700000000numactl- clearcache(unsigned char *mem, unsigned size); 07070100000011000081A40000000000000000000000016319106A00000446000000000000000000000000000000000000002700000000numactl-[2.64]) AC_INIT([numactl], [2.0.14]) AC_CONFIG_SRCDIR([numactl.c]) 
AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_HEADERS([config.h]) AM_INIT_AUTOMAKE([foreign 1.11 silent-rules subdir-objects parallel-tests]) AM_SILENT_RULES([yes]) LT_PREREQ([2.2]) LT_INIT AC_PROG_CC # Override CFLAGS so that we can specify custom CFLAGS for numademo. AX_AM_OVERRIDE_VAR([CFLAGS]) AX_TLS AX_CHECK_COMPILE_FLAG([-ftree-vectorize], [tree_vectorize="true"]) AM_CONDITIONAL([HAVE_TREE_VECTORIZE], [test x"${tree_vectorize}" = x"true"]) AC_CANONICAL_TARGET AM_CONDITIONAL([RISCV64], [test x"${target_cpu}" = x"riscv64"]) AC_CONFIG_FILES([Makefile]) # GCC tries to be "helpful" and only issue a warning for unrecognized # attributes. So we compile the test with Werror, so that if the # attribute is not recognized the compilation fails AC_LANG(C) AC_LANG_WERROR AC_COMPILE_IFELSE([AC_LANG_SOURCE([[__attribute__ ((symver ("foo@foo_1"))) void frob (void) { }]])], [AC_DEFINE([HAVE_ATTRIBUTE_SYMVER], [1], [Checking for symver attribute])], []) AC_OUTPUT 07070100000012000081A40000000000000000000000016319106A00000B52000000000000000000000000000000000000002500000000numactl-* Discover distances Copyright (C) 2005 Andi Kleen, SuSE Labs. libnuma is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; version 2.1. libnuma is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should find a copy of v2.1 of the GNU Lesser General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA All calls are undefined when numa_available returns an error. 
*/ #define _GNU_SOURCE 1 #include <stdio.h> #include <stdlib.h> #include <errno.h> #include "numa.h" #include "numaint.h" static int distance_numnodes; static int *distance_table; static void parse_numbers(char *s, int *iptr) { int i, d, j; char *end; int maxnode = numa_max_node(); int numnodes = 0; for (i = 0; i <= maxnode; i++) if (numa_bitmask_isbitset(numa_nodes_ptr, i)) numnodes++; for (i = 0, j = 0; i <= maxnode; i++, j++) { d = strtoul(s, &end, 0); /* Skip unavailable nodes */ while (j<=maxnode && !numa_bitmask_isbitset(numa_nodes_ptr, j)) j++; if (s == end) break; *(iptr+j) = d; s = end; } } static int read_distance_table(void) { int nd, len; char *line = NULL; size_t linelen = 0; int maxnode = numa_max_node() + 1; int *table = NULL; int err = -1; for (nd = 0;; nd++) { char fn[100]; FILE *dfh; sprintf(fn, "/sys/devices/system/node/node%d/distance", nd); dfh = fopen(fn, "r"); if (!dfh) { if (errno == ENOENT) err = 0; if (!err && nd<maxnode) continue; else break; } len = getdelim(&line, &linelen, '\n', dfh); fclose(dfh); if (len <= 0) break; if (!table) { table = calloc(maxnode * maxnode, sizeof(int)); if (!table) { errno = ENOMEM; break; } } parse_numbers(line, table + nd * maxnode); } free(line); if (err) { numa_warn(W_distance, "Cannot parse distance information in sysfs: %s", strerror(errno)); free(table); return err; } /* Update the global table pointer. Race window here with other threads, but in the worst case we leak one distance array one time, which is tolerable. This avoids a dependency on pthreads. 
*/ if (distance_table) { free(table); return 0; } distance_numnodes = maxnode; distance_table = table; return 0; } int numa_distance(int a, int b) { if (!distance_table) { int err = read_distance_table(); if ((err < 0) || (!distance_table)) return 0; } if ((unsigned)a >= distance_numnodes || (unsigned)b >= distance_numnodes) return 0; return distance_table[a * distance_numnodes + b]; } 07070100000013000081A40000000000000000000000016319106A0000C2B5000000000000000000000000000000000000002400000000numactl-* Simple NUMA library. Copyright (C) 2003,2004,2005,2008 Andi Kleen,SuSE Labs and Cliff Wickman,SGI. libnuma is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; version 2.1. libnuma is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should find a copy of v2.1 of the GNU Lesser General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA All calls are undefined when numa_available returns an error. 
*/ #define _GNU_SOURCE 1 #include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <string.h> #include <sched.h> #include <dirent.h> #include <errno.h> #include <stdarg.h> #include <ctype.h> #include <assert.h> #include <sys/mman.h> #include <limits.h> #include "config.h" #include "numa.h" #include "numaif.h" #include "numaint.h" #include "util.h" #include "affinity.h" #define WEAK __attribute__((weak)) #define CPU_BUFFER_SIZE 4096 /* This limits you to 32768 CPUs */ /* these are the old (version 1) masks */ nodemask_t numa_no_nodes; nodemask_t numa_all_nodes; /* these are now the default bitmask (pointers to) (version 2) */ struct bitmask *numa_no_nodes_ptr = NULL; struct bitmask *numa_all_nodes_ptr = NULL; struct bitmask *numa_possible_nodes_ptr = NULL; struct bitmask *numa_all_cpus_ptr = NULL; struct bitmask *numa_possible_cpus_ptr = NULL; /* I would prefer to use symbol versioning to create v1 and v2 versions of numa_no_nodes and numa_all_nodes, but the loader does not correctly handle versioning of BSS versus small data items */ struct bitmask *numa_nodes_ptr = NULL; static struct bitmask *numa_memnode_ptr = NULL; static unsigned long *node_cpu_mask_v1[NUMA_NUM_NODES]; static char node_cpu_mask_v1_stale = 1; static struct bitmask **node_cpu_mask_v2; static char node_cpu_mask_v2_stale = 1; WEAK void numa_error(char *where); #ifndef TLS #warning "not threadsafe" #define __thread #endif static __thread int bind_policy = MPOL_BIND; static __thread unsigned int mbind_flags = 0; static int sizes_set=0; static int maxconfigurednode = -1; static int maxconfiguredcpu = -1; static int numprocnode = -1; static int numproccpu = -1; static int nodemask_sz = 0; static int cpumask_sz = 0; static int has_preferred_many = 0; int numa_exit_on_error = 0; int numa_exit_on_warn = 0; static void set_sizes(void); /* * There are two special functions, _init(void) and _fini(void), which * are called automatically by the dynamic loader whenever a library is loaded. 
* * The v1 library depends upon nodemask_t's of all nodes and no nodes. */ void __attribute__((constructor)) numa_init(void) { int max,i; if (sizes_set) return; set_sizes(); /* numa_all_nodes should represent existing nodes on this system */ max = numa_num_configured_nodes(); for (i = 0; i < max; i++) nodemask_set_compat((nodemask_t *)&numa_all_nodes, i); memset(&numa_no_nodes, 0, sizeof(numa_no_nodes)); } static void cleanup_node_cpu_mask_v2(void); #define FREE_AND_ZERO(x) if (x) { \ numa_bitmask_free(x); \ x = NULL; \ } void __attribute__((destructor)) numa_fini(void) { FREE_AND_ZERO(numa_all_cpus_ptr); FREE_AND_ZERO(numa_possible_cpus_ptr); FREE_AND_ZERO(numa_all_nodes_ptr); FREE_AND_ZERO(numa_possible_nodes_ptr); FREE_AND_ZERO(numa_no_nodes_ptr); FREE_AND_ZERO(numa_memnode_ptr); FREE_AND_ZERO(numa_nodes_ptr); cleanup_node_cpu_mask_v2(); } static int numa_find_first(struct bitmask *mask) { int i; for (i = 0; i < mask->size; i++) if (numa_bitmask_isbitset(mask, i)) return i; return -1; } /* * The following bitmask declarations, bitmask_*() routines, and associated * _setbit() and _getbit() routines are: * Copyright (c) 2004_2007 Silicon Graphics, Inc. (SGI) All rights reserved. * SGI publishes it under the terms of the GNU General Public License, v2, * as published by the Free Software Foundation. 
*/ static unsigned int _getbit(const struct bitmask *bmp, unsigned int n) { if (n < bmp->size) return (bmp->maskp[n/bitsperlong] >> (n % bitsperlong)) & 1; else return 0; } static void _setbit(struct bitmask *bmp, unsigned int n, unsigned int v) { if (n < bmp->size) { if (v) bmp->maskp[n/bitsperlong] |= 1UL << (n % bitsperlong); else bmp->maskp[n/bitsperlong] &= ~(1UL << (n % bitsperlong)); } } int numa_bitmask_isbitset(const struct bitmask *bmp, unsigned int i) { return _getbit(bmp, i); } struct bitmask * numa_bitmask_setall(struct bitmask *bmp) { unsigned int i; for (i = 0; i < bmp->size; i++) _setbit(bmp, i, 1); return bmp; } struct bitmask * numa_bitmask_clearall(struct bitmask *bmp) { unsigned int i; for (i = 0; i < bmp->size; i++) _setbit(bmp, i, 0); return bmp; } struct bitmask * numa_bitmask_setbit(struct bitmask *bmp, unsigned int i) { _setbit(bmp, i, 1); return bmp; } struct bitmask * numa_bitmask_clearbit(struct bitmask *bmp, unsigned int i) { _setbit(bmp, i, 0); return bmp; } unsigned int numa_bitmask_nbytes(struct bitmask *bmp) { return longsperbits(bmp->size) * sizeof(unsigned long); } /* where n is the number of bits in the map */ /* This function should not exit on failure, but right now we cannot really recover from this. 
*/ struct bitmask * numa_bitmask_alloc(unsigned int n) { struct bitmask *bmp; if (n < 1) { errno = EINVAL; numa_error("request to allocate mask for invalid number"); exit(1); } bmp = malloc(sizeof(*bmp)); if (bmp == 0) goto oom; bmp->size = n; bmp->maskp = calloc(longsperbits(n), sizeof(unsigned long)); if (bmp->maskp == 0) { free(bmp); goto oom; } return bmp; oom: numa_error("Out of memory allocating bitmask"); exit(1); } void numa_bitmask_free(struct bitmask *bmp) { if (bmp == 0) return; free(bmp->maskp); bmp->maskp = (unsigned long *)0xdeadcdef; /* double free tripwire */ free(bmp); return; } /* True if two bitmasks are equal */ int numa_bitmask_equal(const struct bitmask *bmp1, const struct bitmask *bmp2) { unsigned int i; for (i = 0; i < bmp1->size || i < bmp2->size; i++) if (_getbit(bmp1, i) != _getbit(bmp2, i)) return 0; return 1; } /* Hamming Weight: number of set bits */ unsigned int numa_bitmask_weight(const struct bitmask *bmp) { unsigned int i; unsigned int w = 0; for (i = 0; i < bmp->size; i++) if (_getbit(bmp, i)) w++; return w; } /* *****end of bitmask_ routines ************ */ /* Next two can be overwritten by the application for different error handling */ WEAK void numa_error(char *where) { int olde = errno; perror(where); if (numa_exit_on_error) exit(1); errno = olde; } WEAK void numa_warn(int num, char *fmt, ...) 
{ static unsigned warned; va_list ap; int olde = errno; /* Give each warning only once */ if ((1<<num) & warned) return; warned |= (1<<num); va_start(ap,fmt); fprintf(stderr, "libnuma: Warning: "); vfprintf(stderr, fmt, ap); fputc('\n', stderr); va_end(ap); errno = olde; } static void setpol(int policy, struct bitmask *bmp) { if (set_mempolicy(policy, bmp->maskp, bmp->size + 1) < 0) numa_error("set_mempolicy"); } static void getpol(int *oldpolicy, struct bitmask *bmp) { if (get_mempolicy(oldpolicy, bmp->maskp, bmp->size + 1, 0, 0) < 0) numa_error("get_mempolicy"); } static void dombind(void *mem, size_t size, int pol, struct bitmask *bmp) { if (mbind(mem, size, pol, bmp ? bmp->maskp : NULL, bmp ? bmp->size + 1 : 0, mbind_flags) < 0) numa_error("mbind"); } /* (undocumented) */ /* gives the wrong answer for hugetlbfs mappings. */ int numa_pagesize(void) { static int pagesize; if (pagesize > 0) return pagesize; pagesize = getpagesize(); return pagesize; } make_internal_alias(numa_pagesize); /* * Find nodes (numa_nodes_ptr), nodes with memory (numa_memnode_ptr) * and the highest numbered existing node (maxconfigurednode). */ static void set_configured_nodes(void) { DIR *d; struct dirent *de; long long freep; numa_memnode_ptr = numa_allocate_nodemask(); numa_nodes_ptr = numa_allocate_nodemask(); d = opendir("/sys/devices/system/node"); if (!d) { maxconfigurednode = 0; } else { while ((de = readdir(d)) != NULL) { int nd; if (strncmp(de->d_name, "node", 4)) continue; nd = strtoul(de->d_name+4, NULL, 0); numa_bitmask_setbit(numa_nodes_ptr, nd); if (numa_node_size64(nd, &freep) > 0) numa_bitmask_setbit(numa_memnode_ptr, nd); if (maxconfigurednode < nd) maxconfigurednode = nd; } closedir(d); } } /* * Convert the string length of an ascii hex mask to the number * of bits represented by that mask. */ static int s2nbits(const char *s) { return strlen(s) * 32 / 9; } /* Is string 'pre' a prefix of string 's'? 
*/ static int strprefix(const char *s, const char *pre) { return strncmp(s, pre, strlen(pre)) == 0; } static const char *mask_size_file = "/proc/self/status"; static const char *nodemask_prefix = "Mems_allowed:\t"; /* * (do this the way Paul Jackson's libcpuset does it) * The nodemask values in /proc/self/status are in an * ascii format that uses 9 characters for each 32 bits of mask. * (this could also be used to find the cpumask size) */ static void set_nodemask_size(void) { FILE *fp; char *buf = NULL; size_t bufsize = 0; if ((fp = fopen(mask_size_file, "r")) == NULL) goto done; while (getline(&buf, &bufsize, fp) > 0) { if (strprefix(buf, nodemask_prefix)) { nodemask_sz = s2nbits(buf + strlen(nodemask_prefix)); break; } } free(buf); fclose(fp); done: if (nodemask_sz == 0) {/* fall back on error */ int pol; unsigned long *mask = NULL; nodemask_sz = 16; do { nodemask_sz <<= 1; mask = realloc(mask, nodemask_sz / 8); if (!mask) return; } while (get_mempolicy(&pol, mask, nodemask_sz + 1, 0, 0) < 0 && errno == EINVAL && nodemask_sz < 4096*8); free(mask); } } /* * Read a mask consisting of a sequence of hexadecimal longs separated by * commas. Order them correctly and return the number of bits set. */ static int read_mask(char *s, struct bitmask *bmp) { char *end = s; int tmplen = (bmp->size + bitsperint - 1) / bitsperint; unsigned int tmp[tmplen]; unsigned int *start = tmp; unsigned int i, n = 0, m = 0; if (!s) return 0; /* shouldn't happen */ i = strtoul(s, &end, 16); /* Skip leading zeros */ while (!i && *end++ == ',') { i = strtoul(end, &end, 16); } if (!i) /* End of string. No mask */ return -1; start[n++] = i; /* Read sequence of ints */ while (*end++ == ',') { i = strtoul(end, &end, 16); start[n++] = i; /* buffer overflow */ if (n > tmplen) return -1; } /* * Invert sequence of ints if necessary since the first int * is the highest and we put it first because we read it first. 
*/ while (n) { int w; unsigned long x = 0; /* read into long values in an endian-safe way */ for (w = 0; n && w < bitsperlong; w += bitsperint) x |= ((unsigned long)start[n-- - 1] << w); bmp->maskp[m++] = x; } /* * Return the number of bits set */ return numa_bitmask_weight(bmp); } /* * Read a processes constraints in terms of nodes and cpus from * /proc/self/status. */ static void set_task_constraints(void) { int hicpu = maxconfiguredcpu; int i; char *buffer = NULL; size_t buflen = 0; FILE *f; numa_all_cpus_ptr = numa_allocate_cpumask(); numa_possible_cpus_ptr = numa_allocate_cpumask(); numa_all_nodes_ptr = numa_allocate_nodemask(); numa_possible_nodes_ptr = numa_allocate_cpumask(); numa_no_nodes_ptr = numa_allocate_nodemask(); f = fopen(mask_size_file, "r"); if (!f) { //numa_warn(W_cpumap, "Cannot parse %s", mask_size_file); return; } while (getline(&buffer, &buflen, f) > 0) { /* mask starts after [last] tab */ char *mask = strrchr(buffer,'\t') + 1; if (strncmp(buffer,"Cpus_allowed:",13) == 0) numproccpu = read_mask(mask, numa_all_cpus_ptr); if (strncmp(buffer,"Mems_allowed:",13) == 0) { numprocnode = read_mask(mask, numa_all_nodes_ptr); } } fclose(f); free(buffer); for (i = 0; i <= hicpu; i++) numa_bitmask_setbit(numa_possible_cpus_ptr, i); for (i = 0; i <= maxconfigurednode; i++) numa_bitmask_setbit(numa_possible_nodes_ptr, i); /* * Cpus_allowed in the kernel can be defined to all f's * i.e. it may be a superset of the actual available processors. * As such let's reduce numproccpu to the number of actual * available cpus. 
*/ if (numproccpu <= 0) { for (i = 0; i <= hicpu; i++) numa_bitmask_setbit(numa_all_cpus_ptr, i); numproccpu = hicpu+1; } if (numproccpu > hicpu+1) { numproccpu = hicpu+1; for (i=hicpu+1; i<numa_all_cpus_ptr->size; i++) { numa_bitmask_clearbit(numa_all_cpus_ptr, i); } } if (numprocnode <= 0) { for (i = 0; i <= maxconfigurednode; i++) numa_bitmask_setbit(numa_all_nodes_ptr, i); numprocnode = maxconfigurednode + 1; } return; } /* * Find the highest cpu number possible (in other words the size * of a kernel cpumask_t (in bits) - 1) */ static void set_numa_max_cpu(void) { int len = 4096; int n; int olde = errno; struct bitmask *buffer; do { buffer = numa_bitmask_alloc(len); n = numa_sched_getaffinity_v2_int(0, buffer); /* on success, returns size of kernel cpumask_t, in bytes */ if (n < 0) { if (errno == EINVAL) { if (len >= 1024*1024) break; len *= 2; numa_bitmask_free(buffer); continue; } else { numa_warn(W_numcpus, "Unable to determine max cpu" " (sched_getaffinity: %s); guessing...", strerror(errno)); n = sizeof(cpu_set_t); break; } } } while (n < 0); numa_bitmask_free(buffer); errno = olde; cpumask_sz = n*8; } /* * get the total (configured) number of cpus - both online and offline */ static void set_configured_cpus(void) { maxconfiguredcpu = sysconf(_SC_NPROCESSORS_CONF) - 1; if (maxconfiguredcpu == -1) numa_error("sysconf(NPROCESSORS_CONF) failed"); } static void set_kernel_abi() { int oldp; struct bitmask *bmp, *tmp; bmp = numa_allocate_nodemask(); tmp = numa_allocate_nodemask(); if (get_mempolicy(&oldp, bmp->maskp, bmp->size + 1, 0, 0) < 0) goto out; /* Assumes there's always a node 0, and it's online */ numa_bitmask_setbit(tmp, 0); if (set_mempolicy(MPOL_PREFERRED_MANY, tmp->maskp, tmp->size) == 0) { has_preferred_many++; /* reset the old memory policy */ setpol(oldp, bmp); } out: numa_bitmask_free(tmp); numa_bitmask_free(bmp); } /* * Initialize all the sizes. 
*/ static void set_sizes(void) { sizes_set++; set_nodemask_size(); /* size of kernel nodemask_t */ set_configured_nodes(); /* configured nodes listed in /sys */ set_numa_max_cpu(); /* size of kernel cpumask_t */ set_configured_cpus(); /* cpus listed in /sys/devices/system/cpu */ set_task_constraints(); /* cpus and nodes for current task */ set_kernel_abi(); /* man policy supported */ } int numa_num_configured_nodes(void) { /* * NOTE: this function's behavior matches the documentation (ie: it * returns a count of nodes with memory) despite the poor function * naming. We also cannot use the similarly poorly named * numa_all_nodes_ptr as it only tracks nodes with memory from which * the calling process can allocate. Think sparse nodes, memory-less * nodes, cpusets... */ int memnodecount=0, i; for (i=0; i <= maxconfigurednode; i++) { if (numa_bitmask_isbitset(numa_memnode_ptr, i)) memnodecount++; } return memnodecount; } int numa_num_configured_cpus(void) { return maxconfiguredcpu+1; } int numa_num_possible_nodes(void) { return nodemask_sz; } int numa_num_possible_cpus(void) { return cpumask_sz; } int numa_num_task_nodes(void) { return numprocnode; } /* * for backward compatibility */ int numa_num_thread_nodes(void) { return numa_num_task_nodes(); } int numa_num_task_cpus(void) { return numproccpu; } /* * for backward compatibility */ int numa_num_thread_cpus(void) { return numa_num_task_cpus(); } /* * Return the number of the highest node in this running system, */ int numa_max_node(void) { return maxconfigurednode; } make_internal_alias(numa_max_node); /* * Return the number of the highest possible node in a system, * which for v1 is the size of a numa.h nodemask_t(in bits)-1. * but for v2 is the size of a kernel nodemask_t(in bits)-1. 
*/ SYMVER("numa_max_possible_node_v1", "numa_max_possible_node@libnuma_1.1") int numa_max_possible_node_v1(void) { return ((sizeof(nodemask_t)*8)-1); } SYMVER("numa_max_possible_node_v2", "numa_max_possible_node@@libnuma_1.2") int numa_max_possible_node_v2(void) { return numa_num_possible_nodes()-1; } make_internal_alias(numa_max_possible_node_v1); make_internal_alias(numa_max_possible_node_v2); /* * Allocate a bitmask for cpus, of a size large enough to * match the kernel's cpumask_t. */ struct bitmask * numa_allocate_cpumask() { int ncpus = numa_num_possible_cpus(); return numa_bitmask_alloc(ncpus); } /* * Allocate a bitmask the size of a libnuma nodemask_t */ static struct bitmask * allocate_nodemask_v1(void) { int nnodes = numa_max_possible_node_v1_int()+1; return numa_bitmask_alloc(nnodes); } /* * Allocate a bitmask for nodes, of a size large enough to * match the kernel's nodemask_t. */ struct bitmask * numa_allocate_nodemask(void) { struct bitmask *bmp; int nnodes = numa_max_possible_node_v2_int() + 1; bmp = numa_bitmask_alloc(nnodes); return bmp; } /* (cache the result?) */ long long numa_node_size64(int node, long long *freep) { size_t len = 0; char *line = NULL; long long size = -1; FILE *f; char fn[64]; int ok = 0; int required = freep ? 
2 : 1; if (freep) *freep = -1; sprintf(fn,"/sys/devices/system/node/node%d/meminfo", node); f = fopen(fn, "r"); if (!f) return -1; while (getdelim(&line, &len, '\n', f) > 0) { char *end; char *s = strcasestr(line, "kB"); if (!s) continue; --s; while (s > line && isspace(*s)) --s; while (s > line && isdigit(*s)) --s; if (strstr(line, "MemTotal")) { size = strtoull(s,&end,0) << 10; if (end == s) size = -1; else ok++; } if (freep && strstr(line, "MemFree")) { *freep = strtoull(s,&end,0) << 10; if (end == s) *freep = -1; else ok++; } } fclose(f); free(line); if (ok != required) numa_warn(W_badmeminfo, "Cannot parse sysfs meminfo (%d)", ok); return size; } make_internal_alias(numa_node_size64); long long numa_node_size(int node, long long *freep) { long long f2; long long sz = numa_node_size64_int(node, &f2); if (freep) *freep = f2; return sz; } int numa_available(void) { if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS) return -1; return 0; } SYMVER("numa_interleave_memory_v1", "numa_interleave_memory@libnuma_1.1") void numa_interleave_memory_v1(void *mem, size_t size, const nodemask_t *mask) { struct bitmask bitmask; bitmask.size = sizeof(nodemask_t) * 8; bitmask.maskp = (unsigned long *)mask; dombind(mem, size, MPOL_INTERLEAVE, &bitmask); } SYMVER("numa_interleave_memory_v2", "numa_interleave_memory@@libnuma_1.2") void numa_interleave_memory_v2(void *mem, size_t size, struct bitmask *bmp) { dombind(mem, size, MPOL_INTERLEAVE, bmp); } void numa_tonode_memory(void *mem, size_t size, int node) { struct bitmask *nodes; nodes = numa_allocate_nodemask(); numa_bitmask_setbit(nodes, node); dombind(mem, size, bind_policy, nodes); numa_bitmask_free(nodes); } SYMVER("numa_tonodemask_memory_v1", "numa_tonodemask_memory@libnuma_1.1") void numa_tonodemask_memory_v1(void *mem, size_t size, const nodemask_t *mask) { struct bitmask bitmask; bitmask.maskp = (unsigned long *)mask; bitmask.size = sizeof(nodemask_t); dombind(mem, size, bind_policy, &bitmask); } 
SYMVER("numa_tonodemask_memory_v2", "numa_tonodemask_memory@@libnuma_1.2") void numa_tonodemask_memory_v2(void *mem, size_t size, struct bitmask *bmp) { dombind(mem, size, bind_policy, bmp); } void numa_setlocal_memory(void *mem, size_t size) { dombind(mem, size, MPOL_LOCAL, NULL); } void numa_police_memory(void *mem, size_t size) { int pagesize = numa_pagesize_int(); unsigned long i; char *p = mem; for (i = 0; i < size; i += pagesize, p += pagesize) __atomic_and_fetch(p, 0xff, __ATOMIC_RELAXED); } make_internal_alias(numa_police_memory); void *numa_alloc(size_t size) { char *mem; mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); if (mem == (char *)-1) return NULL; numa_police_memory_int(mem, size); return mem; } void *numa_realloc(void *old_addr, size_t old_size, size_t new_size) { char *mem; mem = mremap(old_addr, old_size, new_size, MREMAP_MAYMOVE); if (mem == (char *)-1) return NULL; /* * The memory policy of the allocated pages is preserved by mremap(), so * there is no need to (re)set it here. If the policy of the original * allocation is not set, the new pages will be allocated according to the * process' mempolicy. Trying to allocate explicitly the new pages on the * same node as the original ones would require changing the policy of the * newly allocated pages, which violates the numa_realloc() semantics. 
*/ return mem; } SYMVER("numa_alloc_interleaved_subset_v1", "numa_alloc_interleaved_subset@libnuma_1.1") void *numa_alloc_interleaved_subset_v1(size_t size, const nodemask_t *mask) { char *mem; struct bitmask bitmask; mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); if (mem == (char *)-1) return NULL; bitmask.maskp = (unsigned long *)mask; bitmask.size = sizeof(nodemask_t); dombind(mem, size, MPOL_INTERLEAVE, &bitmask); return mem; } SYMVER("numa_alloc_interleaved_subset_v2", "numa_alloc_interleaved_subset@@libnuma_1.2") void *numa_alloc_interleaved_subset_v2(size_t size, struct bitmask *bmp) { char *mem; mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); if (mem == (char *)-1) return NULL; dombind(mem, size, MPOL_INTERLEAVE, bmp); return mem; } make_internal_alias(numa_alloc_interleaved_subset_v1); make_internal_alias(numa_alloc_interleaved_subset_v2); void * numa_alloc_interleaved(size_t size) { return numa_alloc_interleaved_subset_v2_int(size, numa_all_nodes_ptr); } /* * given a user node mask, set memory policy to use those nodes */ SYMVER("numa_set_interleave_mask_v1", "numa_set_interleave_mask@libnuma_1.1") void numa_set_interleave_mask_v1(nodemask_t *mask) { struct bitmask *bmp; int nnodes = numa_max_possible_node_v1_int()+1; bmp = numa_bitmask_alloc(nnodes); copy_nodemask_to_bitmask(mask, bmp); if (numa_bitmask_equal(bmp, numa_no_nodes_ptr)) setpol(MPOL_DEFAULT, bmp); else setpol(MPOL_INTERLEAVE, bmp); numa_bitmask_free(bmp); } SYMVER("numa_set_interleave_mask_v2", "numa_set_interleave_mask@@libnuma_1.2") void numa_set_interleave_mask_v2(struct bitmask *bmp) { if (numa_bitmask_equal(bmp, numa_no_nodes_ptr)) setpol(MPOL_DEFAULT, bmp); else setpol(MPOL_INTERLEAVE, bmp); } SYMVER("numa_get_interleave_mask_v1", "numa_get_interleave_mask@libnuma_1.1") nodemask_t numa_get_interleave_mask_v1(void) { int oldpolicy; struct bitmask *bmp; nodemask_t mask; bmp = allocate_nodemask_v1(); getpol(&oldpolicy, bmp); if 
(oldpolicy == MPOL_INTERLEAVE) copy_bitmask_to_nodemask(bmp, &mask); else copy_bitmask_to_nodemask(numa_no_nodes_ptr, &mask); numa_bitmask_free(bmp); return mask; } SYMVER("numa_get_interleave_mask_v2", "numa_get_interleave_mask@@libnuma_1.2") struct bitmask * numa_get_interleave_mask_v2(void) { int oldpolicy; struct bitmask *bmp; bmp = numa_allocate_nodemask(); getpol(&oldpolicy, bmp); if (oldpolicy != MPOL_INTERLEAVE) copy_bitmask_to_bitmask(numa_no_nodes_ptr, bmp); return bmp; } /* (undocumented) */ int numa_get_interleave_node(void) { int nd; if (get_mempolicy(&nd, NULL, 0, 0, MPOL_F_NODE) == 0) return nd; return 0; } void *numa_alloc_onnode(size_t size, int node) { char *mem; struct bitmask *bmp; bmp = numa_allocate_nodemask(); numa_bitmask_setbit(bmp, node); mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); if (mem == (char *)-1) mem = NULL; else dombind(mem, size, bind_policy, bmp); numa_bitmask_free(bmp); return mem; } void *numa_alloc_local(size_t size) { char *mem; mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); if (mem == (char *)-1) mem = NULL; else dombind(mem, size, MPOL_LOCAL, NULL); return mem; } void numa_set_bind_policy(int strict) { if (strict) bind_policy = MPOL_BIND; else if (has_preferred_many) bind_policy = MPOL_PREFERRED_MANY; else bind_policy = MPOL_PREFERRED; } SYMVER("numa_set_membind_v1", "numa_set_membind@libnuma_1.1") void numa_set_membind_v1(const nodemask_t *mask) { struct bitmask bitmask; bitmask.maskp = (unsigned long *)mask; bitmask.size = sizeof(nodemask_t); setpol(MPOL_BIND, &bitmask); } SYMVER("numa_set_membind_v2", "numa_set_membind@@libnuma_1.2") void numa_set_membind_v2(struct bitmask *bmp) { setpol(MPOL_BIND, bmp); } make_internal_alias(numa_set_membind_v2); void numa_set_membind_balancing(struct bitmask *bmp) { /* MPOL_F_NUMA_BALANCING: ignore if unsupported */ if (set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING, bmp->maskp, bmp->size + 1) < 0) { if (errno == EINVAL) { 
errno = 0; numa_set_membind_v2(bmp); } else numa_error("set_mempolicy"); } } /* * copy a bitmask map body to a numa.h nodemask_t structure */ void copy_bitmask_to_nodemask(struct bitmask *bmp, nodemask_t *nmp) { int max, i; memset(nmp, 0, sizeof(nodemask_t)); max = (sizeof(nodemask_t)*8); for (i=0; i<bmp->size; i++) { if (i >= max) break; if (numa_bitmask_isbitset(bmp, i)) nodemask_set_compat((nodemask_t *)nmp, i); } } /* * copy a bitmask map body to another bitmask body * fill a larger destination with zeroes */ void copy_bitmask_to_bitmask(struct bitmask *bmpfrom, struct bitmask *bmpto) { int bytes; if (bmpfrom->size >= bmpto->size) { memcpy(bmpto->maskp, bmpfrom->maskp, CPU_BYTES(bmpto->size)); } else if (bmpfrom->size < bmpto->size) { bytes = CPU_BYTES(bmpfrom->size); memcpy(bmpto->maskp, bmpfrom->maskp, bytes); memset(((char *)bmpto->maskp)+bytes, 0, CPU_BYTES(bmpto->size)-bytes); } } /* * copy a numa.h nodemask_t structure to a bitmask map body */ void copy_nodemask_to_bitmask(nodemask_t *nmp, struct bitmask *bmp) { int max, i; numa_bitmask_clearall(bmp); max = (sizeof(nodemask_t)*8); if (max > bmp->size) max = bmp->size; for (i=0; i<max; i++) { if (nodemask_isset_compat(nmp, i)) numa_bitmask_setbit(bmp, i); } } SYMVER("numa_get_membind_v1", "numa_get_membind@libnuma_1.1") nodemask_t numa_get_membind_v1(void) { int oldpolicy; struct bitmask *bmp; nodemask_t nmp; bmp = allocate_nodemask_v1(); getpol(&oldpolicy, bmp); if (oldpolicy == MPOL_BIND) { copy_bitmask_to_nodemask(bmp, &nmp); } else { /* copy the body of the map to numa_all_nodes */ copy_bitmask_to_nodemask(bmp, &numa_all_nodes); nmp = numa_all_nodes; } numa_bitmask_free(bmp); return nmp; } SYMVER("numa_get_membind_v2", "numa_get_membind@@libnuma_1.2") struct bitmask * numa_get_membind_v2(void) { int oldpolicy; struct bitmask *bmp; bmp = numa_allocate_nodemask(); getpol(&oldpolicy, bmp); if (oldpolicy != MPOL_BIND) copy_bitmask_to_bitmask(numa_all_nodes_ptr, bmp); return bmp; } //TODO: do we need a v1 
nodemask_t version? struct bitmask *numa_get_mems_allowed(void) { struct bitmask *bmp; /* * can change, so query on each call. */ bmp = numa_allocate_nodemask(); if (get_mempolicy(NULL, bmp->maskp, bmp->size + 1, 0, MPOL_F_MEMS_ALLOWED) < 0) numa_error("get_mempolicy"); return bmp; } make_internal_alias(numa_get_mems_allowed); void numa_free(void *mem, size_t size) { munmap(mem, size); } SYMVER("numa_parse_bitmap_v1", "numa_parse_bitmap@libnuma_1.1") int numa_parse_bitmap_v1(char *line, unsigned long *mask, int ncpus) { int i; char *p = strchr(line, '\n'); if (!p) return -1; for (i = 0; p > line;i++) { char *oldp, *endp; oldp = p; if (*p == ',') --p; while (p > line && *p != ',') --p; /* Eat two 32bit fields at a time to get longs */ if (p > line && sizeof(unsigned long) == 8) { oldp--; memmove(p, p+1, oldp-p+1); while (p > line && *p != ',') --p; } if (*p == ',') p++; if (i >= CPU_LONGS(ncpus)) return -1; mask[i] = strtoul(p, &endp, 16); if (endp != oldp) return -1; p--; } return 0; } SYMVER("numa_parse_bitmap_v2", "numa_parse_bitmap@@libnuma_1.2") int numa_parse_bitmap_v2(char *line, struct bitmask *mask) { int i, ncpus; char *p = strchr(line, '\n'); if (!p) return -1; ncpus = mask->size; for (i = 0; p > line;i++) { char *oldp, *endp; oldp = p; if (*p == ',') --p; while (p > line && *p != ',') --p; /* Eat two 32bit fields at a time to get longs */ if (p > line && sizeof(unsigned long) == 8) { oldp--; memmove(p, p+1, oldp-p+1); while (p > line && *p != ',') --p; } if (*p == ',') p++; if (i >= CPU_LONGS(ncpus)) return -1; mask->maskp[i] = strtoul(p, &endp, 16); if (endp != oldp) return -1; p--; } return 0; } static void init_node_cpu_mask_v2(void) { int nnodes = numa_max_possible_node_v2_int() + 1; node_cpu_mask_v2 = calloc (nnodes, sizeof(struct bitmask *)); } static void cleanup_node_cpu_mask_v2(void) { if (node_cpu_mask_v2) { int i; int nnodes; nnodes = numa_max_possible_node_v2_int() + 1; for (i = 0; i < nnodes; i++) { FREE_AND_ZERO(node_cpu_mask_v2[i]); } 
free(node_cpu_mask_v2); node_cpu_mask_v2 = NULL; } } /* This would be better with some locking, but I don't want to make libnuma dependent on pthreads right now. The races are relatively harmless. */ SYMVER("numa_node_to_cpus_v1", "numa_node_to_cpus@libnuma_1.1") int numa_node_to_cpus_v1(int node, unsigned long *buffer, int bufferlen) { int err = 0; char fn[64]; FILE *f; char update; char *line = NULL; size_t len = 0; struct bitmask bitmask; int buflen_needed; unsigned long *mask; int ncpus = numa_num_possible_cpus(); int maxnode = numa_max_node_int(); buflen_needed = CPU_BYTES(ncpus); if ((unsigned)node > maxnode || bufferlen < buflen_needed) { errno = ERANGE; return -1; } if (bufferlen > buflen_needed) memset(buffer, 0, bufferlen); update = __atomic_fetch_and(&node_cpu_mask_v1_stale, 0, __ATOMIC_RELAXED); if (node_cpu_mask_v1[node] && !update) { memcpy(buffer, node_cpu_mask_v1[node], buflen_needed); return 0; } mask = malloc(buflen_needed); if (!mask) mask = (unsigned long *)buffer; memset(mask, 0, buflen_needed); sprintf(fn, "/sys/devices/system/node/node%d/cpumap", node); f = fopen(fn, "r"); if (!f || getdelim(&line, &len, '\n', f) < 1) { if (numa_bitmask_isbitset(numa_nodes_ptr, node)) { numa_warn(W_nosysfs2, "/sys not mounted or invalid. Assuming one node: %s", strerror(errno)); numa_warn(W_nosysfs2, "(cannot open or correctly parse %s)", fn); } bitmask.maskp = (unsigned long *)mask; bitmask.size = buflen_needed * 8; numa_bitmask_setall(&bitmask); err = -1; } if (f) fclose(f); if (line && (numa_parse_bitmap_v1(line, mask, ncpus) < 0)) { numa_warn(W_cpumap, "Cannot parse cpumap. Assuming one node"); bitmask.maskp = (unsigned long *)mask; bitmask.size = buflen_needed * 8; numa_bitmask_setall(&bitmask); err = -1; } free(line); memcpy(buffer, mask, buflen_needed); /* slightly racy, see above */ if (node_cpu_mask_v1[node]) { if (update) { /* * There may be readers on node_cpu_mask_v1[], hence it can not * be freed. 
*/ memcpy(node_cpu_mask_v1[node], mask, buflen_needed); free(mask); mask = NULL; } else if (mask != buffer) free(mask); } else { node_cpu_mask_v1[node] = mask; } return err; } /* * test whether a node has cpus */ /* This would be better with some locking, but I don't want to make libnuma dependent on pthreads right now. The races are relatively harmless. */ /* * deliver a bitmask of cpus representing the cpus on a given node */ SYMVER("numa_node_to_cpus_v2", "numa_node_to_cpus@@libnuma_1.2") int numa_node_to_cpus_v2(int node, struct bitmask *buffer) { int err = 0; int nnodes = numa_max_node(); char fn[64], *line = NULL; FILE *f; char update; size_t len = 0; struct bitmask *mask; if (!node_cpu_mask_v2) init_node_cpu_mask_v2(); if (node > nnodes) { errno = ERANGE; return -1; } numa_bitmask_clearall(buffer); update = __atomic_fetch_and(&node_cpu_mask_v2_stale, 0, __ATOMIC_RELAXED); if (node_cpu_mask_v2[node] && !update) { /* have already constructed a mask for this node */ if (buffer->size < node_cpu_mask_v2[node]->size) { errno = EINVAL; numa_error("map size mismatch"); return -1; } copy_bitmask_to_bitmask(node_cpu_mask_v2[node], buffer); return 0; } /* need a new mask for this node */ mask = numa_allocate_cpumask(); /* this is a kernel cpumask_t (see node_read_cpumap()) */ sprintf(fn, "/sys/devices/system/node/node%d/cpumap", node); f = fopen(fn, "r"); if (!f || getdelim(&line, &len, '\n', f) < 1) { if (numa_bitmask_isbitset(numa_nodes_ptr, node)) { numa_warn(W_nosysfs2, "/sys not mounted or invalid. Assuming one node: %s", strerror(errno)); numa_warn(W_nosysfs2, "(cannot open or correctly parse %s)", fn); } numa_bitmask_setall(mask); err = -1; } if (f) fclose(f); if (line && (numa_parse_bitmap_v2(line, mask) < 0)) { numa_warn(W_cpumap, "Cannot parse cpumap. 
Assuming one node"); numa_bitmask_setall(mask); err = -1; } free(line); copy_bitmask_to_bitmask(mask, buffer); /* slightly racy, see above */ /* save the mask we created */ if (node_cpu_mask_v2[node]) { if (update) { copy_bitmask_to_bitmask(mask, node_cpu_mask_v2[node]); numa_bitmask_free(mask); mask = NULL; /* how could this be? */ } else if (mask != buffer) numa_bitmask_free(mask); } else { /* we don't want to cache faulty result */ if (!err) node_cpu_mask_v2[node] = mask; else numa_bitmask_free(mask); } return err; } make_internal_alias(numa_node_to_cpus_v1); make_internal_alias(numa_node_to_cpus_v2); void numa_node_to_cpu_update(void) { __atomic_store_n(&node_cpu_mask_v1_stale, 1, __ATOMIC_RELAXED); __atomic_store_n(&node_cpu_mask_v2_stale, 1, __ATOMIC_RELAXED); } /* report the node of the specified cpu */ int numa_node_of_cpu(int cpu) { struct bitmask *bmp; int ncpus, nnodes, node, ret; ncpus = numa_num_possible_cpus(); if (cpu > ncpus){ errno = EINVAL; return -1; } bmp = numa_bitmask_alloc(ncpus); nnodes = numa_max_node(); for (node = 0; node <= nnodes; node++){ if (numa_node_to_cpus_v2_int(node, bmp) < 0) { /* It's possible for the node to not exist */ continue; } if (numa_bitmask_isbitset(bmp, cpu)){ ret = node; goto end; } } ret = -1; errno = EINVAL; end: numa_bitmask_free(bmp); return ret; } SYMVER("numa_run_on_node_mask_v1", "numa_run_on_node_mask@libnuma_1.1") int numa_run_on_node_mask_v1(const nodemask_t *mask) { int ncpus = numa_num_possible_cpus(); int i, k, err; unsigned long cpus[CPU_LONGS(ncpus)], nodecpus[CPU_LONGS(ncpus)]; memset(cpus, 0, CPU_BYTES(ncpus)); for (i = 0; i < NUMA_NUM_NODES; i++) { if (mask->n[i / BITS_PER_LONG] == 0) continue; if (nodemask_isset_compat(mask, i)) { if (numa_node_to_cpus_v1_int(i, nodecpus, CPU_BYTES(ncpus)) < 0) { numa_warn(W_noderunmask, "Cannot read node cpumask from sysfs"); continue; } for (k = 0; k < CPU_LONGS(ncpus); k++) cpus[k] |= nodecpus[k]; } } err = numa_sched_setaffinity_v1(0, CPU_BYTES(ncpus), cpus); 
/* The sched_setaffinity API is broken because it expects the user to guess the kernel cpuset size. Do this in a brute force way. */ if (err < 0 && errno == EINVAL) { int savederrno = errno; char *bigbuf; static int size = -1; if (size == -1) size = CPU_BYTES(ncpus) * 2; bigbuf = malloc(CPU_BUFFER_SIZE); if (!bigbuf) { errno = ENOMEM; return -1; } errno = savederrno; while (size <= CPU_BUFFER_SIZE) { memcpy(bigbuf, cpus, CPU_BYTES(ncpus)); memset(bigbuf + CPU_BYTES(ncpus), 0, CPU_BUFFER_SIZE - CPU_BYTES(ncpus)); err = numa_sched_setaffinity_v1_int(0, size, (unsigned long *)bigbuf); if (err == 0 || errno != EINVAL) break; size *= 2; } savederrno = errno; free(bigbuf); errno = savederrno; } return err; } /* * Given a node mask (size of a kernel nodemask_t) (probably populated by * a user argument list) set up a map of cpus (map "cpus") on those nodes. * Then set affinity to those cpus. */ SYMVER("numa_run_on_node_mask_v2", "numa_run_on_node_mask@@libnuma_1.2") int numa_run_on_node_mask_v2(struct bitmask *bmp) { int ncpus, i, k, err; struct bitmask *cpus, *nodecpus; cpus = numa_allocate_cpumask(); ncpus = cpus->size; nodecpus = numa_allocate_cpumask(); for (i = 0; i < bmp->size; i++) { if (bmp->maskp[i / BITS_PER_LONG] == 0) continue; if (numa_bitmask_isbitset(bmp, i)) { /* * numa_all_nodes_ptr is cpuset aware; use only * these nodes */ if (!numa_bitmask_isbitset(numa_all_nodes_ptr, i)) { numa_warn(W_noderunmask, "node %d not allowed", i); continue; } if (numa_node_to_cpus_v2_int(i, nodecpus) < 0) { numa_warn(W_noderunmask, "Cannot read node cpumask from sysfs"); continue; } for (k = 0; k < CPU_LONGS(ncpus); k++) cpus->maskp[k] |= nodecpus->maskp[k]; } } err = numa_sched_setaffinity_v2_int(0, cpus); numa_bitmask_free(cpus); numa_bitmask_free(nodecpus); /* used to have to consider that this could fail - it shouldn't now */ if (err < 0) { numa_error("numa_sched_setaffinity_v2_int() failed"); } return err; } make_internal_alias(numa_run_on_node_mask_v2); /* * Given a 
node mask (size of a kernel nodemask_t) (probably populated by * a user argument list) set up a map of cpus (map "cpus") on those nodes * without any cpuset awareness. Then set affinity to those cpus. */ int numa_run_on_node_mask_all(struct bitmask *bmp) { int ncpus, i, k, err; struct bitmask *cpus, *nodecpus; cpus = numa_allocate_cpumask(); ncpus = cpus->size; nodecpus = numa_allocate_cpumask(); for (i = 0; i < bmp->size; i++) { if (bmp->maskp[i / BITS_PER_LONG] == 0) continue; if (numa_bitmask_isbitset(bmp, i)) { if (!numa_bitmask_isbitset(numa_possible_nodes_ptr, i)) { numa_warn(W_noderunmask, "node %d not allowed", i); continue; } if (numa_node_to_cpus_v2_int(i, nodecpus) < 0) { numa_warn(W_noderunmask, "Cannot read node cpumask from sysfs"); continue; } for (k = 0; k < CPU_LONGS(ncpus); k++) cpus->maskp[k] |= nodecpus->maskp[k]; } } err = numa_sched_setaffinity_v2_int(0, cpus); numa_bitmask_free(cpus); numa_bitmask_free(nodecpus); /* With possible nodes freedom it can happen easily now */ if (err < 0) { numa_error("numa_sched_setaffinity_v2_int() failed"); } return err; } SYMVER("numa_get_run_node_mask_v1", "numa_get_run_node_mask@libnuma_1.1") nodemask_t numa_get_run_node_mask_v1(void) { int ncpus = numa_num_configured_cpus(); int i, k; int max = numa_max_node_int(); struct bitmask *bmp, *cpus, *nodecpus; nodemask_t nmp; cpus = numa_allocate_cpumask(); if (numa_sched_getaffinity_v2_int(0, cpus) < 0){ nmp = numa_no_nodes; goto free_cpus; } nodecpus = numa_allocate_cpumask(); bmp = allocate_nodemask_v1(); /* the size of a nodemask_t */ for (i = 0; i <= max; i++) { if (numa_node_to_cpus_v2_int(i, nodecpus) < 0) { /* It's possible for the node to not exist */ continue; } for (k = 0; k < CPU_LONGS(ncpus); k++) { if (nodecpus->maskp[k] & cpus->maskp[k]) numa_bitmask_setbit(bmp, i); } } copy_bitmask_to_nodemask(bmp, &nmp); numa_bitmask_free(bmp); numa_bitmask_free(nodecpus); free_cpus: numa_bitmask_free(cpus); return nmp; } SYMVER("numa_get_run_node_mask_v2", 
"numa_get_run_node_mask@@libnuma_1.2") struct bitmask * numa_get_run_node_mask_v2(void) { int i, k; int ncpus = numa_num_configured_cpus(); int max = numa_max_node_int(); struct bitmask *bmp, *cpus, *nodecpus; bmp = numa_allocate_cpumask(); cpus = numa_allocate_cpumask(); if (numa_sched_getaffinity_v2_int(0, cpus) < 0){ copy_bitmask_to_bitmask(numa_no_nodes_ptr, bmp); goto free_cpus; } nodecpus = numa_allocate_cpumask(); for (i = 0; i <= max; i++) { /* * numa_all_nodes_ptr is cpuset aware; show only * these nodes */ if (!numa_bitmask_isbitset(numa_all_nodes_ptr, i)) { continue; } if (numa_node_to_cpus_v2_int(i, nodecpus) < 0) { /* It's possible for the node to not exist */ continue; } for (k = 0; k < CPU_LONGS(ncpus); k++) { if (nodecpus->maskp[k] & cpus->maskp[k]) numa_bitmask_setbit(bmp, i); } } numa_bitmask_free(nodecpus); free_cpus: numa_bitmask_free(cpus); return bmp; } int numa_migrate_pages(int pid, struct bitmask *fromnodes, struct bitmask *tonodes) { int numa_num_nodes = numa_num_possible_nodes(); return migrate_pages(pid, numa_num_nodes + 1, fromnodes->maskp, tonodes->maskp); } int numa_move_pages(int pid, unsigned long count, void **pages, const int *nodes, int *status, int flags) { return move_pages(pid, count, pages, nodes, status, flags); } int numa_run_on_node(int node) { int numa_num_nodes = numa_num_possible_nodes(); int ret = -1; struct bitmask *cpus; if (node >= numa_num_nodes){ errno = EINVAL; goto out; } cpus = numa_allocate_cpumask(); if (node == -1) numa_bitmask_setall(cpus); else if (numa_node_to_cpus_v2_int(node, cpus) < 0){ numa_warn(W_noderunmask, "Cannot read node cpumask from sysfs"); goto free; } ret = numa_sched_setaffinity_v2_int(0, cpus); free: numa_bitmask_free(cpus); out: return ret; } static struct bitmask *__numa_preferred(void) { int policy; struct bitmask *bmp; bmp = numa_allocate_nodemask(); /* could read the current CPU from /proc/self/status. Probably not worth it. 
*/ numa_bitmask_clearall(bmp); getpol(&policy, bmp); if (policy != MPOL_PREFERRED && policy != MPOL_PREFERRED_MANY && policy != MPOL_BIND) return bmp; if (numa_bitmask_weight(bmp) > 1) numa_error(__FILE__); return bmp; } int numa_preferred(void) { return numa_find_first(__numa_preferred()); } static void __numa_set_preferred(struct bitmask *bmp) { int nodes = numa_bitmask_weight(bmp); if (nodes > 1) numa_error(__FILE__); setpol(nodes ? MPOL_PREFERRED : MPOL_LOCAL, bmp); } void numa_set_preferred(int node) { struct bitmask *bmp = numa_allocate_nodemask(); numa_bitmask_setbit(bmp, node); __numa_set_preferred(bmp); numa_bitmask_free(bmp); } int numa_has_preferred_many(void) { return has_preferred_many; } void numa_set_preferred_many(struct bitmask *bitmask) { if (!has_preferred_many) { numa_error("Unable to handle MANY preferred nodes. Falling back to first node\n"); __numa_set_preferred(bitmask); } setpol(MPOL_PREFERRED_MANY, bitmask); } struct bitmask *numa_preferred_many() { return __numa_preferred(); } void numa_set_localalloc(void) { setpol(MPOL_LOCAL, numa_no_nodes_ptr); } SYMVER("numa_bind_v1", "numa_bind@libnuma_1.1") void numa_bind_v1(const nodemask_t *nodemask) { struct bitmask bitmask; bitmask.maskp = (unsigned long *)nodemask; bitmask.size = sizeof(nodemask_t); numa_run_on_node_mask_v2_int(&bitmask); numa_set_membind_v2_int(&bitmask); } SYMVER("numa_bind_v2", "numa_bind@@libnuma_1.2") void numa_bind_v2(struct bitmask *bmp) { numa_run_on_node_mask_v2_int(bmp); numa_set_membind_v2_int(bmp); } void numa_set_strict(int flag) { if (flag) mbind_flags |= MPOL_MF_STRICT; else mbind_flags &= ~MPOL_MF_STRICT; } /* * Extract a node or processor number from the given string. 
* Allow a relative node / processor specification within the allowed * set if "relative" is nonzero */ static unsigned long get_nr(const char *s, char **end, struct bitmask *bmp, int relative) { long i, nr; if (!relative) return strtoul(s, end, 0); nr = strtoul(s, end, 0); if (s == *end) return nr; /* Find the nth set bit */ for (i = 0; nr >= 0 && i <= bmp->size; i++) if (numa_bitmask_isbitset(bmp, i)) nr--; return i-1; } /* * __numa_parse_nodestring() is called to create a node mask, given * an ascii string such as 25 or 12-15 or 1,3,5-7 or +6-10. * (the + indicates that the numbers are nodeset-relative) * * The nodes may be specified as absolute, or relative to the current nodeset. * The list of available nodes is in a map pointed to by "allowed_nodes_ptr", * which may represent all nodes or the nodes in the current nodeset. * * The caller must free the returned bitmask. */ static struct bitmask * __numa_parse_nodestring(const char *s, struct bitmask *allowed_nodes_ptr) { int invert = 0, relative = 0; int conf_nodes = numa_num_configured_nodes(); char *end; struct bitmask *mask; mask = numa_allocate_nodemask(); if (s[0] == 0){ copy_bitmask_to_bitmask(numa_no_nodes_ptr, mask); return mask; /* return freeable mask */ } if (*s == '!') { invert = 1; s++; } if (*s == '+') { relative++; s++; } do { unsigned long arg; int i; if (isalpha(*s)) { int n; if (!strcmp(s,"all")) { copy_bitmask_to_bitmask(allowed_nodes_ptr, mask); s+=4; break; } n = resolve_affinity(s, mask); if (n != NO_IO_AFFINITY) { if (n < 0) goto err; s += strlen(s) + 1; break; } } arg = get_nr(s, &end, allowed_nodes_ptr, relative); if (end == s) { numa_warn(W_nodeparse, "unparseable node description `%s'\n", s); goto err; } if (!numa_bitmask_isbitset(allowed_nodes_ptr, arg)) { numa_warn(W_nodeparse, "node argument %d is out of range\n", arg); goto err; } i = arg; numa_bitmask_setbit(mask, i); s = end; if (*s == '-') { char *end2; unsigned long arg2; arg2 = get_nr(++s, &end2, allowed_nodes_ptr, relative); 
if (end2 == s) { numa_warn(W_nodeparse, "missing node argument %s\n", s); goto err; } if (!numa_bitmask_isbitset(allowed_nodes_ptr, arg2)) { numa_warn(W_nodeparse, "node argument %d out of range\n", arg2); goto err; } while (arg <= arg2) { i = arg; if (numa_bitmask_isbitset(allowed_nodes_ptr,i)) numa_bitmask_setbit(mask, i); arg++; } s = end2; } } while (*s++ == ','); if (s[-1] != '\0') goto err; if (invert) { int i; for (i = 0; i < conf_nodes; i++) { if (numa_bitmask_isbitset(mask, i)) numa_bitmask_clearbit(mask, i); else numa_bitmask_setbit(mask, i); } } return mask; err: numa_bitmask_free(mask); return NULL; } /* * numa_parse_nodestring() is called to create a bitmask from nodes available * for this task. */ struct bitmask * numa_parse_nodestring(const char *s) { return __numa_parse_nodestring(s, numa_all_nodes_ptr); } /* * numa_parse_nodestring_all() is called to create a bitmask from all nodes * available. */ struct bitmask * numa_parse_nodestring_all(const char *s) { return __numa_parse_nodestring(s, numa_possible_nodes_ptr); } /* * __numa_parse_cpustring() is called to create a bitmask, given * an ascii string such as 25 or 12-15 or 1,3,5-7 or +6-10. * (the + indicates that the numbers are cpuset-relative) * * The cpus may be specified as absolute, or relative to the current cpuset. * The list of available cpus for this task is in the map pointed to by * "allowed_cpus_ptr", which may represent all cpus or the cpus in the * current cpuset. * * The caller must free the returned bitmask. 
*/ static struct bitmask * __numa_parse_cpustring(const char *s, struct bitmask *allowed_cpus_ptr) { int invert = 0, relative=0; int conf_cpus = numa_num_configured_cpus(); char *end; struct bitmask *mask; int i; mask = numa_allocate_cpumask(); if (s[0] == 0) return mask; if (*s == '!') { invert = 1; s++; } if (*s == '+') { relative++; s++; } do { unsigned long arg; if (!strcmp(s,"all")) { copy_bitmask_to_bitmask(allowed_cpus_ptr, mask); s+=4; break; } arg = get_nr(s, &end, allowed_cpus_ptr, relative); if (end == s) { numa_warn(W_cpuparse, "unparseable cpu description `%s'\n", s); goto err; } if (!numa_bitmask_isbitset(allowed_cpus_ptr, arg)) { numa_warn(W_cpuparse, "cpu argument %s is out of range\n", s); goto err; } i = arg; numa_bitmask_setbit(mask, i); s = end; if (*s == '-') { char *end2; unsigned long arg2; arg2 = get_nr(++s, &end2, allowed_cpus_ptr, relative); if (end2 == s) { numa_warn(W_cpuparse, "missing cpu argument %s\n", s); goto err; } if (!numa_bitmask_isbitset(allowed_cpus_ptr, arg2)) { numa_warn(W_cpuparse, "cpu argument %s out of range\n", s); goto err; } while (arg <= arg2) { i = arg; if (numa_bitmask_isbitset(allowed_cpus_ptr, i)) numa_bitmask_setbit(mask, i); arg++; } s = end2; } } while (*s++ == ','); if (s[-1] != '\0') goto err; if (invert) { for (i = 0; i < conf_cpus; i++) { if (numa_bitmask_isbitset(mask, i)) numa_bitmask_clearbit(mask, i); else numa_bitmask_setbit(mask, i); } } return mask; err: numa_bitmask_free(mask); return NULL; } /* * numa_parse_cpustring() is called to create a bitmask from cpus available * for this task. */ struct bitmask * numa_parse_cpustring(const char *s) { return __numa_parse_cpustring(s, numa_all_cpus_ptr); } /* * numa_parse_cpustring_all() is called to create a bitmask from all cpus * available. 
*/ struct bitmask * numa_parse_cpustring_all(const char *s) { return __numa_parse_cpustring(s, numa_possible_cpus_ptr); } 07070100000014000041ED0000000000000000000000016319106A00000000000000000000000000000000000000000000001D00000000numactl- m4 files generated by libtoolize: libtool.m4 ltoptions.m4 ltsugar.m4 ltversion.m4 lt~obsolete.m4 07070100000016000081A40000000000000000000000016319106A000014DA000000000000000000000000000000000000003300000000numactl- =========================================================================== # # =========================================================================== # # SYNOPSIS # # AX_AM_OVERRIDE_VAR([varname1 varname ... ]) # AX_AM_OVERRIDE_FINALIZE # # DESCRIPTION # # This autoconf macro generalizes the approach given in # <> which # moves user specified values for variable 'varname' given at configure # time into the corresponding AM_${varname} variable and clears out # 'varname', allowing further manipulation by the configure script so that # target specific variables can be given specialized versions. 'varname # may still be specified on the make command line and will be appended as # usual. # # As an example usage, consider a project which might benefit from # different compiler flags for different components. Typically this is # done via target specific flags, e.g. # # libgtest_la_CXXFLAGS = \ # -I $(top_srcdir)/tests \ # -I $(top_builddir)/tests \ # $(GTEST_CXXFLAGS) # # automake will automatically append $(CXXFLAGS) -- provided by the user # -- to the build rule for libgtest_la. That might be problematic, as # CXXFLAGS may contain compiler options which are inappropriate for # libgtest_la. # # The approach laid out in the referenced mailing list message is to # supply a base value for a variable during _configure_ time, during which # it is possible to amend it for specific targets. 
The user may # subsequently specify a value for the variable during _build_ time, which # make will apply (via the standard automake rules) to all appropriate # targets. # # For example, # # AX_AM_OVERRIDE_VAR([CXXFLAGS]) # # will store the value of CXXFLAGS specified at configure time into the # AM_CXXFLAGS variable, AC_SUBST it, and clear CXXFLAGS. configure may # then create a target specific set of flags based upon AM_CXXFLAGS, e.g. # # # googletest uses variadic macros, which g++ -pedantic-errors # # is very unhappy about # AC_SUBST([GTEST_CXXFLAGS], # [`AS_ECHO_N(["$AM_CXXFLAGS"]) \ # | sed s/-pedantic-errors/-pedantic/` # ] # ) # # which would be used in a as above. Since CXXFLAGS is # cleared, the configure time value will not affect the build for # libgtest_la. # # Prior to _any other command_ which may set ${varname}, call # # AX_AM_OVERRIDE_VAR([varname]) # # This will preserve the value (if any) passed to configure in # AM_${varname} and AC_SUBST([AM_${varname}). You may pass a space # separated list of variable names, or may call AX_AM_OVERRIDE_VAR # multiple times for the same effect. # # If any subsequent configure commands set ${varname} and you wish to # capture the resultant value into AM_${varname} in the case where # ${varname} was _not_ provided at configure time, call # # AX_AM_OVERRIDE_FINALIZE # # after _all_ commands which might affect any of the variables specified # in calls to AX_AM_OVERRIDE_VAR. This need be done only once, but # repeated calls will not cause harm. # # There is a bit of trickery required to allow further manipulation of the # AM_${varname} in a file. If AM_CFLAGS is used as is in a #, e.g. # # libfoo_la_CFLAGS = $(AM_CFLAGS) # # then automake will emit code in which sets AM_CFLAGS from # the configure'd value. # # If however, AM_CFLAGS is manipulated (i.e. 
appended to), you will have # to explicitly arrange for the configure'd value to be substituted: # # AM_CFLAGS = @AM_CFLAGS@ # AM_CFLAGS += -lfoo # # or else automake will complain about using += before =. # # LICENSE # # Copyright (c) 2013 Smithsonian Astrophysical Observatory # Copyright (c) 2013 Diab Jerius <> # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty. #serial 1 AC_DEFUN([_AX_AM_OVERRIDE_INITIALIZE], [ m4_define([_mst_am_override_vars],[]) ]) # _AX_AM_OVERRIDE_VAR(varname) AC_DEFUN([_AX_AM_OVERRIDE_VAR], [ m4_define([_mst_am_override_vars], m4_defn([_mst_am_override_vars]) $1 ) _mst_am_override_$1_set=false AS_IF( [test "${$1+set}" = set], [AC_SUBST([AM_$1],["$$1"]) $1= _mst_am_override_$1_set=: ] ) ]) # _AX_AM_OVERRIDE_VAR # _AX_AM_OVERRIDE_FINALIZE(varname) AC_DEFUN([_AX_AM_OVERRIDE_FINALIZE], [ AS_IF([$_mst_am_override_$1_set = :], [], [AC_SUBST([AM_$1],["$$1"]) $1= _mst_am_override_$1_set= ] ) AC_SUBST($1) ]) # _AX_AM_OVERRIDE_FINALIZE AC_DEFUN([AX_AM_OVERRIDE_VAR], [ AC_REQUIRE([_AX_AM_OVERRIDE_INITIALIZE]) m4_map_args_w([$1],[_AX_AM_OVERRIDE_VAR(],[)]) ])# AX_OVERRIDE_VAR # AX_AM_OVERRIDE_FINALIZE AC_DEFUN([AX_AM_OVERRIDE_FINALIZE], [ AC_REQUIRE([_AX_AM_OVERRIDE_INITIALIZE]) m4_map_args_w(_mst_am_override_vars,[_AX_AM_OVERRIDE_FINALIZE(],[)]) ]) # AX_AM_OVERRIDE_FINALIZE 07070100000017000081A40000000000000000000000016319106A00000D09000000000000000000000000000000000000003600000000numactl- =========================================================================== # # =========================================================================== # # SYNOPSIS # # AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT]) # # DESCRIPTION # # Check whether the given FLAG works with the current language's compiler # or gives an error. 
(Warnings, however, are ignored) # # ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on # success/failure. # # If EXTRA-FLAGS is defined, it is added to the current language's default # flags (e.g. CFLAGS) when the check is done. The check is thus made with # the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to # force the compiler to issue an error when a bad flag is given. # # INPUT gives an alternative input source to AC_COMPILE_IFELSE. # # NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this # macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. # # LICENSE # # Copyright (c) 2008 Guido U. Draheim <> # Copyright (c) 2011 Maarten Bosmans <> # # This program is free software: you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see <>. # # As a special exception, the respective Autoconf Macro's copyright owner # gives unlimited permission to copy, distribute and modify the configure # scripts that are the output of Autoconf when processing the Macro. You # need not follow the terms of the GNU General Public License when using # or distributing such scripts, even though portions of the text of the # Macro appear in them. The GNU General Public License (GPL) does govern # all other use of the material that constitutes the Autoconf Macro. # # This special exception to the GPL applies to versions of the Autoconf # Macro released by the Autoconf Archive. 
When you make and distribute a # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. #serial 3 AC_DEFUN([AX_CHECK_COMPILE_FLAG], [AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1" AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])], [AS_VAR_SET(CACHEVAR,[yes])], [AS_VAR_SET(CACHEVAR,[no])]) _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes], [m4_default([$2], :)], [m4_default([$3], :)]) AS_VAR_POPDEF([CACHEVAR])dnl ])dnl AX_CHECK_COMPILE_FLAGS 07070100000018000081A40000000000000000000000016319106A00000BE3000000000000000000000000000000000000002700000000numactl- =========================================================================== # # =========================================================================== # # SYNOPSIS # # AX_TLS([action-if-found], [action-if-not-found]) # # DESCRIPTION # # Provides a test for the compiler support of thread local storage (TLS) # extensions. Defines TLS if it is found. Currently knows about GCC/ICC # and MSVC. I think SunPro uses the same as GCC, and Borland apparently # supports either. # # LICENSE # # Copyright (c) 2008 Alan Woodland <> # Copyright (c) 2010 Diego Elio Petteno` <> # # This program is free software: you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the # Free Software Foundation, either version 3 of the License, or (at your # option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General # Public License for more details. 
# # You should have received a copy of the GNU General Public License along # with this program. If not, see <>. # # As a special exception, the respective Autoconf Macro's copyright owner # gives unlimited permission to copy, distribute and modify the configure # scripts that are the output of Autoconf when processing the Macro. You # need not follow the terms of the GNU General Public License when using # or distributing such scripts, even though portions of the text of the # Macro appear in them. The GNU General Public License (GPL) does govern # all other use of the material that constitutes the Autoconf Macro. # # This special exception to the GPL applies to versions of the Autoconf # Macro released by the Autoconf Archive. When you make and distribute a # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. #serial 10 AC_DEFUN([AX_TLS], [ AC_MSG_CHECKING(for thread local storage (TLS) class) AC_CACHE_VAL(ac_cv_tls, [ ax_tls_keywords="__thread __declspec(thread) none" for ax_tls_keyword in $ax_tls_keywords; do AS_CASE([$ax_tls_keyword], [none], [ac_cv_tls=none ; break], [AC_TRY_COMPILE( [#include <stdlib.h> static void foo(void) { static ] $ax_tls_keyword [ int bar; exit(1); }], [], [ac_cv_tls=$ax_tls_keyword ; break], ac_cv_tls=none )]) done ]) AC_MSG_RESULT($ac_cv_tls) AS_IF([test "$ac_cv_tls" != "none"], AC_DEFINE_UNQUOTED([TLS], $ac_cv_tls, [If the compiler supports a TLS storage class define it to that here]) m4_ifnblank([$1], [$1]), m4_ifnblank([$2], [$2]) ) ]) 07070100000019000081A40000000000000000000000016319106A00000088000000000000000000000000000000000000002300000000numactl-!/bin/sh # print names of all functions listed in numa.3 # no globals grep '^\.BI.*numa.*(' numa.3 | sed -e 's/.*\(numa_.*\)(.*/\1/' 0707010000001A000081A40000000000000000000000016319106A000006F6000000000000000000000000000000000000002300000000numactl- MEMHOG 8 "2003,2004" "SuSE Labs" "Linux 
Administrator's Manual" .SH NAME memhog \- Allocates memory with policy for testing .SH SYNOPSIS .B memhog [ .B \-r<NUM> ] [ .B size kmg ] [ .B policy nodeset ] [ .B \-f<filename> ] .SH DESCRIPTION .B memhog mmaps a memory region for a given size and sets the numa policy (if specified). It then updates the memory region for the given number of iterations using memset. .TS tab(|); l l. -r<num>|Repeat memset NUM times -f<file>|Open file for mmap backing -H|Disable transparent hugepages -size|Allocation size in bytes, may have case-insensitive order |suffix (G=gigabyte, M=megabyte, K=kilobyte) .TE Supported numa-policies: .TP .B interleave Memory will be allocated using round robin on nodes. When memory cannot be allocated on the current interleave target, fall back to other nodes. Multiple nodes may be specified. .TP .B membind Only allocate memory from nodes. Allocation will fail when there is not enough memory available on these nodes. Multiple nodes may be specified. .TP .B preferred Preferably allocate memory on node, but if memory cannot be allocated there fall back to other nodes. This option takes only a single node number. .TP .B default Memory will be allocated on the local node (the node the thread is running on) .SH EXAMPLES .TP # Allocate a 1G region, mmap backed by memhog.mmap file, membind to node 0, repeat test 6 times memhog -r6 1G --membind 0 -fmemhog.mmap .TP # Allocate a 1G region, interleave across nodes 0,1,2,3, repeat test 4 times memhog -r4 1G --interleave 0-3 .TP # Allocate a 1G region, (implicit) default policy, repeat test 8 times memhog -r8 1G .SH AUTHORS Andi Kleen ( .SH LICENSE GPL v2 .SH SEE ALSO .I mmap(2), memset(3), numactl(8), numastat(8) 0707010000001B000081A40000000000000000000000016319106A00000CDD000000000000000000000000000000000000002300000000numactl-* Copyright (C) 2003,2004 Andi Kleen, SuSE Labs. Allocate memory with policy for testing.
numactl is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2. numactl is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should find a copy of v2 of the GNU General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <stdlib.h> #include <stdio.h> #include <sys/mman.h> #include <sys/fcntl.h> #include <string.h> #include <stdbool.h> #include "numa.h" #include "numaif.h" #include "util.h" #define terr(x) perror(x) enum { UNIT = 10*1024*1024, }; #ifndef MADV_NOHUGEPAGE #define MADV_NOHUGEPAGE 15 #endif static void usage(void) { printf("memhog [-fFILE] [-rNUM] size[kmg] [policy [nodeset]]\n"); printf("-f mmap is backed by FILE\n"); printf("-rNUM repeat memset NUM times\n"); printf("-H disable transparent hugepages\n"); print_policies(); exit(1); } long length; static void hog(void *map) { long i; for (i = 0; i < length; i += UNIT) { long left = length - i; if (left > UNIT) left = UNIT; putchar('.'); fflush(stdout); memset(map + i, 0xff, left); } putchar('\n'); } int main(int ac, char **av) { char *map; struct bitmask *nodes, *gnodes; int policy, gpolicy; int ret = 0; int loose = 0; int i; int fd = -1; bool disable_hugepage = false; int repeat = 1; nodes = numa_allocate_nodemask(); gnodes = numa_allocate_nodemask(); while (av[1] && av[1][0] == '-') { switch (av[1][1]) { case 'f': fd = open(av[1]+2, O_RDWR); if (fd < 0) perror(av[1]+2); break; case 'r': repeat = atoi(av[1] + 2); break; case 'H': disable_hugepage = true; break; default: usage(); } av++; } if (!av[1]) usage(); length = memsize(av[1]); if (av[2] && numa_available() < 0) { printf("Kernel doesn't support NUMA 
policy\n"); } else loose = 1; policy = parse_policy(av[2], av[3]); if (policy == MPOL_MAX) usage(); if (policy != MPOL_DEFAULT && policy != MPOL_LOCAL) nodes = numa_parse_nodestring(av[3]); if (!nodes) { printf ("<%s> is invalid\n", av[3]); exit(1); } if (fd >= 0) map = mmap(NULL,length,PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); else map = mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); if (map == (char*)-1) err("mmap"); if (mbind(map, length, policy, nodes->maskp, nodes->size, 0) < 0) terr("mbind"); if (disable_hugepage) madvise(map, length, MADV_NOHUGEPAGE); gpolicy = -1; if (get_mempolicy(&gpolicy, gnodes->maskp, gnodes->size, map, MPOL_F_ADDR) < 0) terr("get_mempolicy"); if (!loose && policy != gpolicy) { ret = 1; printf("policy %d gpolicy %d\n", policy, gpolicy); } if (!loose && !numa_bitmask_equal(gnodes, nodes)) { printf("nodes differ %lx, %lx!\n", gnodes->maskp[0], nodes->maskp[0]); ret = 1; } for (i = 0; i < repeat; i++) hog(map); exit(ret); } 0707010000001C000081A40000000000000000000000016319106A000007E2000000000000000000000000000000000000002900000000numactl-\" t .\" Copyright 2005-2006 Christoph Lameter, Silicon Graphics, Inc. .\" .\" based on Andi Kleen's numactl manpage .\" .TH MIGRATEPAGES 8 "Jan 2005" "SGI" "Linux Administrator's Manual" .SH NAME migratepages \- Migrate the physical location a processes pages .SH SYNOPSIS .B migratepages pid from-nodes to-nodes .SH DESCRIPTION .B migratepages moves the physical location of a processes pages without any changes of the virtual address space of the process. Moving the pages allows one to change the distances of a process to its memory. Performance may be optimized by moving a processes pages to the node where it is executing. If multiple nodes are specified for from-nodes or to-nodes then an attempt is made to preserve the relative location of each page in each nodeset. 
For example if we move from nodes 2-5 to 7,9,12-13 then the preferred mode of operation is to move pages from 2->7, 3->9, 4->12 and 5->13. However, this is only possible if enough memory is available. .TP Valid node specifiers .TS tab(:); l l. all:All nodes number:Node number number1{,number2}:Node number1 and Node number2 number1-number2:Nodes from number1 to number2 ! nodes:Invert selection of the following specification. .TE .SH NOTES Requires a NUMA policy aware kernel with support for page migration (Linux 2.6.16 and later). migratepages will only move pages that are not shared with other processes if called by a user without administrative privileges (but with the right to modify the process). migratepages will move all pages if invoked by root (or a user with administrative privileges). .SH FILES .I /proc/<pid>/numa_maps for information about the NUMA memory use of a process. .SH COPYRIGHT Copyright 2005-2006 Christoph Lameter, Silicon Graphics, Inc. migratepages is under the GNU General Public License, v.2 .SH SEE ALSO .I numactl(8) , .I set_mempolicy(2) , .I get_mempolicy(2) , .I mbind(2) , .I sched_setaffinity(2) , .I sched_getaffinity(2) , .I proc(5) , .I ftok(3) , .I shmat(2) , .I taskset(1) 0707010000001D000081A40000000000000000000000016319106A0000085D000000000000000000000000000000000000002900000000numactl-* * Copyright (C) 2005 Christoph Lameter, Silicon Graphics, Incorporated. * based on Andi Kleen's numactl.c. * * Manual process migration * * migratepages is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; version 2. * * migratepages is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details.
* * You should find a copy of v2 of the GNU General Public License somewhere * on your Linux system; if not, write to the Free Software Foundation, * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define _GNU_SOURCE #include <getopt.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include "numa.h" #include "util.h" static struct option opts[] = { {"help", 0, 0, 'h' }, { 0 } }; static void usage(void) { fprintf(stderr, "usage: migratepages pid from-nodes to-nodes\n" "\n" "nodes is a comma delimited list of node numbers or A-B ranges or all.\n" ); exit(1); } static void checknuma(void) { static int numa = -1; if (numa < 0) { if (numa_available() < 0) complain("This system does not support NUMA functionality"); } numa = 0; } int main(int argc, char *argv[]) { int c; char *end; int rc; int pid; struct bitmask *fromnodes; struct bitmask *tonodes; while ((c = getopt_long(argc,argv,"h", opts, NULL)) != -1) { switch (c) { default: usage(); } } argv += optind; argc -= optind; if (argc != 3) usage(); checknuma(); pid = strtoul(argv[0], &end, 0); if (*end || end == argv[0]) usage(); fromnodes = numa_parse_nodestring(argv[1]); if (!fromnodes) { printf ("<%s> is invalid\n", argv[1]); exit(1); } tonodes = numa_parse_nodestring(argv[2]); if (!tonodes) { printf ("<%s> is invalid\n", argv[2]); exit(1); } rc = numa_migrate_pages(pid, fromnodes, tonodes); if (rc < 0) { perror("migrate_pages"); return 1; } return 0; } 0707010000001E000081A40000000000000000000000016319106A00000320000000000000000000000000000000000000002500000000numactl-\" t .\" Copyright 2005-2007 Christoph Lameter, Silicon Graphics, Inc. 
.\" .\" based on Andi Kleen's numactl manpage .\" .TH MIGSPEED 8 "April 2005" "SGI" "Linux Administrator's Manual" .SH NAME migspeed \- Test the speed of page migration .SH SYNOPSIS .B migspeed -p pages from-nodes to-nodes .SH DESCRIPTION .B migspeed attempts to move a sample of pages from the indicated node to the target node and measures the time it takes to perform the move. .B -p pages The default sample is 1000 pages. Override that with another number. .SH NOTES Requires a NUMA policy aware kernel with support for page migration (Linux 2.6.16 and later). .SH COPYRIGHT Copyright 2007 Christoph Lameter, Silicon Graphics, Inc. migratepages is under the GNU General Public License, v.2 .SH SEE ALSO .I numactl(8) 0707010000001F000081A40000000000000000000000016319106A00000D64000000000000000000000000000000000000002500000000numactl-* * Migration test program * * (C) 2007 Silicon Graphics, Inc. Christoph Lameter <> * */ #include <stdio.h> #include <stdlib.h> #include "numa.h" #include "numaif.h" #include <time.h> #include <errno.h> #include <malloc.h> #include <unistd.h> #include "util.h" static const char *optstr = "hvp:"; static char *cmd; static int verbose; static unsigned long pages = 1000; static void usage(void) { printf("usage %s [-p pages] [-h] [-v] from-nodes to-nodes\n", cmd); printf(" from and to nodes may specified in form N or N-N\n"); printf(" -p pages number of pages to try (defaults to %ld)\n", pages); printf(" -v verbose\n"); printf(" -h usage\n"); exit(1); } static void displaymap(void) { FILE *f = fopen("/proc/self/numa_maps","r"); if (!f) { printf("/proc/self/numa_maps not accessible.\n"); exit(1); } while (!feof(f)) { char buffer[2000]; if (!fgets(buffer, sizeof(buffer), f)) break; if (!strstr(buffer, "bind")) continue ; printf("%s", buffer); } fclose(f); } int main(int argc, char *argv[]) { char *p; int option; struct timespec result; unsigned long bytes; double duration, mbytes; struct bitmask *from; struct bitmask *to; char *memory = NULL; 
unsigned long pagesize; struct timespec start,end; pagesize = getpagesize(); /* Command line processing */ opterr = 1; cmd = argv[0]; while ((option = getopt(argc, argv, optstr)) != EOF) switch (option) { case 'h' : case '?' : usage(); case 'v' : verbose++; break; case 'p' : pages = strtoul(optarg, &p, 0); if (p == optarg || *p) usage(); break; } if (!argv[optind]) usage(); if (verbose > 1) printf("numa_max_node = %d\n", numa_max_node()); numa_exit_on_error = 1; from = numa_parse_nodestring(argv[optind]); if (!from) { printf ("<%s> is invalid\n", argv[optind]); exit(1); } if (errno) { perror("from mask"); exit(1); } if (verbose) printmask("From", from); if (!argv[optind+1]) usage(); to = numa_parse_nodestring(argv[optind+1]); if (!to) { printf ("<%s> is invalid\n", argv[optind+1]); exit(1); } if (errno) { perror("to mask"); exit(1); } if (verbose) printmask("To", to); bytes = pages * pagesize; if (verbose) printf("Allocating %lu pages of %lu bytes of memory\n", pages, pagesize); memory = memalign(pagesize, bytes); if (!memory) { printf("Out of Memory\n"); exit(2); } if (mbind(memory, bytes, MPOL_BIND, from->maskp, from->size, 0) < 0) numa_error("mbind"); if (verbose) printf("Dirtying memory....\n"); for (p = memory; p <= memory + bytes; p += pagesize) *p = 1; if (verbose) printf("Starting test\n"); displaymap(); clock_gettime(CLOCK_REALTIME, &start); if (mbind(memory, bytes, MPOL_BIND, to->maskp, to->size, MPOL_MF_MOVE) <0) numa_error("memory move"); clock_gettime(CLOCK_REALTIME, &end); displaymap(); result.tv_sec = end.tv_sec - start.tv_sec; result.tv_nsec = end.tv_nsec - start.tv_nsec; if (result.tv_nsec < 0) { result.tv_sec--; result.tv_nsec += 1000000000; } if (result.tv_nsec >= 1000000000) { result.tv_sec++; result.tv_nsec -= 1000000000; } duration = result.tv_sec + result.tv_nsec / 1000000000.0; mbytes = bytes / (1024*1024.0); printf("%1.1f Mbyte migrated in %1.2f secs. 
%3.1f Mbytes/second\n", mbytes, duration, mbytes / duration); return 0; } 07070100000020000081A40000000000000000000000016319106A00001113000000000000000000000000000000000000002700000000numactl-\" Hey Emacs! This file is -*- nroff -*- source. .\" .\" This manpage is Copyright (C) 2006 Silicon Graphics, Inc. .\" Christoph Lameter .\" .\" Permission is granted to make and distribute verbatim copies of this .\" manual provided the copyright notice and this permission notice are .\" preserved on all copies. .\" .\" Permission is granted to copy and distribute modified versions of this .\" manual under the conditions for verbatim copying, provided that the .\" entire resulting derived work is distributed under the terms of a .\" permission notice identical to this one. .\" .TH MOVE_PAGES 2 2006-10-31 "Linux 2.6.18" "Linux Programmer's Manual" .SH NAME move_pages \- Move individual pages of a process to another node .SH SYNOPSIS .B #include <numaif.h> .sp .BI "long move_pages(int " pid ", unsigned long count, void ** " pages ", const int * " nodes ", int * " status ", int " flags ); .SH DESCRIPTION .BR move_pages () moves .I count pages to the .I nodes. The result of the move is reflected in .I status. The .I flags indicate constraints on the pages to be moved. .I pid is the process id in which pages are to be moved. Sufficient rights must exist to move pages of another process. This means the moving process either has root priviledges, has SYS_NICE administrative rights or the same owner. If pid is 0 then we move pages of the current process. .I count is the number of pages to move. It defines the size of the three arrays .I pages, .I nodes and .I status. .I pages is an array of pointers to the pages that should be moved. These are pointers that should be aligned to page boundaries. Addresses are specified as seen by the process specified by .I pid. .I nodes is either an array of integers that specify the desired location for each page or it is NULL. 
Each integer is a node number. If NULL is specified then move_pages will not move any pages but will return the node of each page in the .I status array. Having the status of each page may be necessary to determine pages that need to be moved. .I status is an array of integers that return the status of each page. The array only contains valid values if .I move_pages did not return an error code. .I flags specify what types of pages to move. .B MPOL_MF_MOVE means that only pages that are in exclusive use by the process are to be moved. .B MPOL_MF_MOVE_ALL means that pages shared between multiple processes can also be moved. The process must have root privileges or SYS_NICE privileges. .SH Page states in the status array .TP .B 0..MAX_NUMNODES Indicates that the location of the page is on this node. .TP .B -ENOENT The page is not present. .TP .B -EACCES The page is mapped by multiple processes and can only be moved if .I MPOL_MF_MOVE_ALL is specified. .TP .B -EBUSY The page is currently busy and cannot be moved. Try again later. This occurs if a page is undergoing I/O or another kernel subsystem is holding a reference to the page. .TP .B -EFAULT This is a zero page or the memory area is not mapped by the process. .TP .B -ENOMEM Unable to allocate memory on target node. .TP .B -EIO Unable to write back a page. The page has to be written back in order to move it since the page is dirty and the filesystem does not provide a migration function that would allow the move of dirty pages. .TP .B -EINVAL A dirty page cannot be moved. The filesystem does not provide a migration function and has no ability to write back pages. .SH "RETURN VALUE" On success .B move_pages returns zero. .SH ERRORS .TP .B -ENOENT No pages were found that require moving. All pages are either already on the target node, not present, had an invalid address or could not be moved because they were mapped by multiple processes.
.TP .B -EINVAL Flags other than .I MPOL_MF_MOVE and .I MPOL_MF_MOVE_ALL was specified or an attempt was made to migrate pages of a kernel thread. .TP .B -EPERM .I MPOL_MF_MOVE_ALL specified without sufficient privileges or an attempt to move a process belonging to another user. .TP .B -EACCESS On of the target nodes is not allowed by the current cpuset. .TP .B -ENODEV On of the target nodes is not online. .TP .B -ESRCH Process does not exist. .TP .B -E2BIG Too many pages to move. .TP .B -EFAULT Parameter array could not be accessed. .SH "SEE ALSO" .BR numa_maps (5), .BR migratepages (8), .BR numa_stat (8), .BR numa (3) 07070100000021000081A40000000000000000000000016319106A00000485000000000000000000000000000000000000001F00000000numactl-* Mersenne twister implementation from Michael Brundage. Public Domain. MT is a very fast pseudo random number generator. This version works on 32bit words. Changes by AK. */ #include <stdlib.h> #include "mt.h" int mt_index; unsigned int mt_buffer[MT_LEN]; void mt_init(void) { int i; srand(1); for (i = 0; i < MT_LEN; i++) mt_buffer[i] = rand(); mt_index = 0; } #define MT_IA 397 #define MT_IB (MT_LEN - MT_IA) #define UPPER_MASK 0x80000000 #define LOWER_MASK 0x7FFFFFFF #define MATRIX_A 0x9908B0DF #define TWIST(b,i,j) ((b)[i] & UPPER_MASK) | ((b)[j] & LOWER_MASK) #define MAGIC(s) (((s)&1)*MATRIX_A) void mt_refill(void) { int i; unsigned int s; unsigned int * b = mt_buffer; mt_index = 0; i = 0; for (; i < MT_IB; i++) { s = TWIST(b, i, i+1); b[i] = b[i + MT_IA] ^ (s >> 1) ^ MAGIC(s); } for (; i < MT_LEN-1; i++) { s = TWIST(b, i, i+1); b[i] = b[i - MT_IB] ^ (s >> 1) ^ MAGIC(s); } s = TWIST(b, MT_LEN-1, 0); b[MT_LEN-1] = b[MT_IA-1] ^ (s >> 1) ^ MAGIC(s); } 07070100000022000081A40000000000000000000000016319106A000001AB000000000000000000000000000000000000001F00000000numactl- MT_LEN 624 extern void mt_init(void); extern void mt_refill(void); extern int mt_index; extern unsigned int mt_buffer[MT_LEN]; static inline unsigned int mt_random(void) { 
unsigned int * b = mt_buffer; int idx = mt_index; if (idx == MT_LEN*sizeof(unsigned int)) { mt_refill(); idx = 0; } mt_index += sizeof(unsigned int); return *(unsigned int *)((unsigned char *)b + idx); } 07070100000023000081A40000000000000000000000016319106A000088AB000000000000000000000000000000000000002100000000numactl-\" Copyright 2003,2004 Andi Kleen, SuSE Labs. .\" .\" Permission is granted to make and distribute verbatim copies of this .\" manual provided the copyright notice and this permission notice are .\" preserved on all copies. .\" .\" Permission is granted to copy and distribute modified versions of this .\" manual under the conditions for verbatim copying, provided that the .\" entire resulting derived work is distributed under the terms of a .\" permission notice identical to this one. .\" .\" Since the Linux kernel and libraries are constantly changing, this .\" manual page may be incorrect or out-of-date. The author(s) assume no .\" responsibility for errors or omissions, or for damages resulting from .\" the use of the information contained herein. .\" .\" Formatted or processed versions of this manual, if unaccompanied by .\" the source, must acknowledge the copyright and authors of this work. .TH NUMA 3 "December 2007" "SuSE Labs" "Linux Programmer's Manual" .SH NAME numa \- NUMA policy library .SH SYNOPSIS .B #include <numa.h> .sp .B cc ... 
\-lnuma .sp .B int numa_available(void); .sp .BI "int numa_max_possible_node(void);" .br .BI "int numa_num_possible_nodes();" .sp .B int numa_max_node(void); .br .BI "int numa_num_configured_nodes();" .br .B struct bitmask *numa_get_mems_allowed(void); .sp .BI "int numa_num_configured_cpus(void);" .br .BI "struct bitmask *numa_all_nodes_ptr;" .br .BI "struct bitmask *numa_no_nodes_ptr;" .br .BI "struct bitmask *numa_all_cpus_ptr;" .sp .BI "int numa_num_task_cpus();" .br .BI "int numa_num_task_nodes();" .sp .BI "int numa_parse_bitmap(char *" line " , struct bitmask *" mask "); .br .BI "struct bitmask *numa_parse_nodestring(const char *" string ); .br .BI "struct bitmask *numa_parse_nodestring_all(const char *" string ); .br .BI "struct bitmask *numa_parse_cpustring(const char *" string ); .br .BI "struct bitmask *numa_parse_cpustring_all(const char *" string ); .sp .BI "long long numa_node_size(int " node ", long long*" freep ); .br .BI "long long numa_node_size64(int " node ", long long *" freep ); .sp .B int numa_preferred(void); .br .B int numa_has_preferred_many(void); .br .B struct bitmask *numa_preferred_many(void); .br .BI "void numa_set_preferred(int " node ); .br .BI "void numa_set_preferred_many(struct bitmask *" nodemask ); .br .BI "int numa_get_interleave_node(void); .br .B struct bitmask *numa_get_interleave_mask(void); .br .BI "void numa_set_interleave_mask(struct bitmask *" nodemask ); .br .BI "void numa_interleave_memory(void *" start ", size_t " size ", struct bitmask *" nodemask ); .br .BI "void numa_bind(struct bitmask *" nodemask ); .br .BI "void numa_set_localalloc(void); .br .BI "void numa_set_membind(struct bitmask *" nodemask ); .br .BI "void numa_set_membind_balancing(struct bitmask *" nodemask ); .br .B struct bitmask *numa_get_membind(void); .sp .BI "void *numa_alloc_onnode(size_t " size ", int " node ); .br .BI "void *numa_alloc_local(size_t " size ); .br .BI "void *numa_alloc_interleaved(size_t " size ); .br .BI "void 
*numa_alloc_interleaved_subset(size_t " size ", struct bitmask *" nodemask ); .BI "void *numa_alloc(size_t " size ); .br .BI "void *numa_realloc(void *"old_addr ", size_t " old_size ", size_t " new_size ); .br .BI "void numa_free(void *" start ", size_t " size ); .sp .BI "int numa_run_on_node(int " node ); .br .BI "int numa_run_on_node_mask(struct bitmask *" nodemask ); .br .BI "int numa_run_on_node_mask_all(struct bitmask *" nodemask ); .br .B struct bitmask *numa_get_run_node_mask(void); .sp .BI "void numa_tonode_memory(void *" start ", size_t " size ", int " node ); .br .BI "void numa_tonodemask_memory(void *" start ", size_t " size ", struct bitmask *" nodemask ); .br .BI "void numa_setlocal_memory(void *" start ", size_t " size ); .br .BI "void numa_police_memory(void *" start ", size_t " size ); .br .BI "void numa_set_bind_policy(int " strict ); .br .BI "void numa_set_strict(int " strict ); .sp .\" should be undocumented ?? .BI "int numa_distance(int " node1 ", int " node2 ); .sp .BI "int numa_sched_getaffinity(pid_t " pid ", struct bitmask *" mask ); .br .BI "int numa_sched_setaffinity(pid_t " pid ", struct bitmask *" mask ); .br .BI "int numa_node_to_cpus(int " node ", struct bitmask *" mask "); .br .BI "void numa_node_to_cpu_update();" .br .BI "int numa_node_of_cpu(int " cpu "); .sp .BI "struct bitmask *numa_allocate_cpumask();" .sp .BI "void numa_free_cpumask();" .br .BI "struct bitmask *numa_allocate_nodemask();" .sp .BI "void numa_free_nodemask();" .br .BI "struct bitmask *numa_bitmask_alloc(unsigned int " n "); .br .BI "struct bitmask *numa_bitmask_clearall(struct bitmask *" bmp ); .br .BI "struct bitmask *numa_bitmask_clearbit(struct bitmask *" bmp ", unsigned int " n ); .br .BI "int numa_bitmask_equal(const struct bitmask *" bmp1 ", const struct bitmask *" bmp2 ); .br .BI "void numa_bitmask_free(struct bitmask *" bmp ); .br .BI "int numa_bitmask_isbitset(const struct bitmask *" bmp ", unsigned int " n ");" .br .BI "unsigned int 
numa_bitmask_nbytes(struct bitmask *" bmp ); .br .BI "struct bitmask *numa_bitmask_setall(struct bitmask *" bmp ); .br .BI "struct bitmask *numa_bitmask_setbit(struct bitmask *" bmp ", unsigned int " n ); .br .BI "void copy_bitmask_to_nodemask(struct bitmask *" bmp ", nodemask_t *" nodemask ) .br .BI "void copy_nodemask_to_bitmask(nodemask_t *" nodemask ", struct bitmask *" bmp ) .br .BI "void copy_bitmask_to_bitmask(struct bitmask *" bmpfrom ", struct bitmask *" bmpto ) .br .BI "unsigned int numa_bitmask_weight(const struct bitmask *bmp ) .sp .BI "int numa_move_pages(int " pid ", unsigned long " count ", void **" pages ", const int *" nodes ", int *" status ", int " flags ); .br .BI "int numa_migrate_pages(int " pid ", struct bitmask *" fromnodes ", struct bitmask *" tonodes ); .sp .BI "void numa_error(char *" where ); .sp .BI "extern int " numa_exit_on_error ; .br .BI "extern int " numa_exit_on_warn ; .br .BI "void numa_warn(int " number ", char *" where ", ...);" .br .SH DESCRIPTION The .I libnuma library offers a simple programming interface to the NUMA (Non Uniform Memory Access) policy supported by the Linux kernel. On a NUMA architecture some memory areas have different latency or bandwidth than others. Available policies are page interleaving (i.e., allocate in a round-robin fashion from all, or a subset, of the nodes on the system), preferred node allocation (i.e., preferably allocate on a particular node), local allocation (i.e., allocate on the node on which the task is currently executing), or allocation only on specific nodes (i.e., allocate on some subset of the available nodes). It is also possible to bind tasks to specific nodes. Numa memory allocation policy may be specified as a per-task attribute, that is inherited by children tasks and processes, or as an attribute of a range of process virtual address space. Numa memory policies specified for a range of virtual address space are shared by all tasks in the process. 
Furthermore, memory policies specified for a range of a shared memory attached using .I shmat(2) or .I mmap(2) from shmfs/hugetlbfs are shared by all processes that attach to that region. Memory policies for shared disk backed file mappings are currently ignored. The default memory allocation policy for tasks and all memory range is local allocation. This assumes that no ancestor has installed a non-default policy. For setting a specific policy globally for all memory allocations in a process and its children it is easiest to start it with the .BR numactl (8) utility. For more finegrained policy inside an application this library can be used. All numa memory allocation policy only takes effect when a page is actually faulted into the address space of a process by accessing it. The .B numa_alloc_* functions take care of this automatically. A .I node is defined as an area where all memory has the same speed as seen from a particular CPU. A node can contain multiple CPUs. Caches are ignored for this definition. Most functions in this library are only concerned about numa nodes and their memory. The exceptions to this are: .IR numa_node_to_cpus (), .IR numa_node_to_cpu_update (), .IR numa_node_of_cpu (), .IR numa_bind (), .IR numa_run_on_node (), .IR numa_run_on_node_mask (), .IR numa_run_on_node_mask_all (), and .IR numa_get_run_node_mask (). These functions deal with the CPUs associated with numa nodes. See the descriptions below for more information. Some of these functions accept or return a pointer to struct bitmask. A struct bitmask controls a bit map of arbitrary length containing a bit representation of nodes. The predefined variable .I numa_all_nodes_ptr points to a bit mask that has all available nodes set; .I numa_no_nodes_ptr points to the empty set. Before any other calls in this library can be used .BR numa_available () must be called. If it returns \-1, all other functions in this library are undefined. 
.BR numa_max_possible_node() returns the number of the highest possible node in a system. In other words, the size of a kernel type nodemask_t (in bits) minus 1. This number can be gotten by calling .BR numa_num_possible_nodes() and subtracting 1. .BR numa_num_possible_nodes() returns the size of kernel's node mask (kernel type nodemask_t). In other words, large enough to represent the maximum number of nodes that the kernel can handle. This will match the kernel's MAX_NUMNODES value. This count is derived from /proc/self/status, field Mems_allowed. .BR numa_max_node () returns the highest node number available on the current system. (See the node numbers in /sys/devices/system/node/ ). Also see .BR numa_num_configured_nodes(). .BR numa_num_configured_nodes() returns the number of memory nodes in the system. This count includes any nodes that are currently disabled. This count is derived from the node numbers in /sys/devices/system/node. (Depends on the kernel being configured with /sys (CONFIG_SYSFS)). .BR numa_get_mems_allowed() returns the mask of nodes from which the process is allowed to allocate memory in its current cpuset context. Any nodes that are not included in the returned bitmask will be ignored in any of the following libnuma memory policy calls. .BR numa_num_configured_cpus() returns the number of cpus in the system. This count includes any cpus that are currently disabled. This count is derived from the cpu numbers in /sys/devices/system/cpu. If the kernel is configured without /sys (CONFIG_SYSFS=n) then it falls back to using the number of online cpus. .BR numa_all_nodes_ptr points to a bitmask that is allocated by the library with bits representing all nodes on which the calling task may allocate memory. This set may be up to all nodes on the system, or up to the nodes in the current cpuset. The bitmask is allocated by a call to .BR numa_allocate_nodemask() using size .BR numa_max_possible_node().
The set of nodes to record is derived from /proc/self/status, field "Mems_allowed". The user should not alter this bitmask. .BR numa_no_nodes_ptr points to a bitmask that is allocated by the library and left all zeroes. The bitmask is allocated by a call to .BR numa_allocate_nodemask() using size .BR numa_max_possible_node(). The user should not alter this bitmask. .BR numa_all_cpus_ptr points to a bitmask that is allocated by the library with bits representing all cpus on which the calling task may execute. This set may be up to all cpus on the system, or up to the cpus in the current cpuset. The bitmask is allocated by a call to .BR numa_allocate_cpumask() using size .BR numa_num_possible_cpus(). The set of cpus to record is derived from /proc/self/status, field "Cpus_allowed". The user should not alter this bitmask. .BR numa_num_task_cpus() returns the number of cpus that the calling task is allowed to use. This count is derived from the map /proc/self/status, field "Cpus_allowed". Also see the bitmask .BR numa_all_cpus_ptr. .BR numa_num_task_nodes() returns the number of nodes on which the calling task is allowed to allocate memory. This count is derived from the map /proc/self/status, field "Mems_allowed". Also see the bitmask .BR numa_all_nodes_ptr. .BR numa_parse_bitmap() parses .I line , which is a character string such as found in /sys/devices/system/node/nodeN/cpumap into a bitmask structure. The string contains the hexadecimal representation of a bit map. The bitmask may be allocated with .BR numa_allocate_cpumask(). Returns 0 on success. Returns -1 on failure. This function is probably of little use to a user application, but it is used by .I libnuma internally. .BR numa_parse_nodestring() parses a character string list of nodes into a bit mask. The bit mask is allocated by .BR numa_allocate_nodemask(). The string is a comma-separated list of node numbers or node ranges. A leading ! 
can be used to indicate "not" this list (in other words, all nodes except this list), and a leading + can be used to indicate that the node numbers in the list are relative to the task's cpuset. The string can be "all" to specify all ( .BR numa_num_task_nodes() ) nodes. Node numbers are limited by the number in the system. See .BR numa_max_node() and .BR numa_num_configured_nodes(). .br Examples: 1-5,7,10 !4-5 +0-3 .br If the string is of 0 length, bitmask .BR numa_no_nodes_ptr is returned. Returns 0 if the string is invalid. .BR numa_parse_nodestring_all() is similar to .BR numa_parse_nodestring , but can parse all possible nodes, not only the current nodeset. .BR numa_parse_cpustring() parses a character string list of cpus into a bit mask. The bit mask is allocated by .BR numa_allocate_cpumask(). The string is a comma-separated list of cpu numbers or cpu ranges. A leading ! can be used to indicate "not" this list (in other words, all cpus except this list), and a leading + can be used to indicate that the cpu numbers in the list are relative to the task's cpuset. The string can be "all" to specify all ( .BR numa_num_task_cpus() ) cpus. Cpu numbers are limited by the number in the system. See .BR numa_num_task_cpus() and .BR numa_num_configured_cpus(). .br Examples: 1-5,7,10 !4-5 +0-3 .br Returns 0 if the string is invalid. .BR numa_parse_cpustring_all() is similar to .BR numa_parse_cpustring , but can parse all possible cpus, not only the current cpuset. .BR numa_node_size () returns the memory size of a node. If the argument .I freep is not NULL, it is used to return the amount of free memory on the node. On error it returns \-1. .BR numa_node_size64 () works the same as .BR numa_node_size (). This is useful on 32-bit architectures with large nodes. .BR numa_preferred () returns the preferred node of the current task. This is the node on which the kernel preferably allocates memory, unless some other policy overrides this.
.\" TODO: results are misleading for MPOL_PREFERRED and may .\" be incorrect for MPOL_BIND when Mel Gorman's twozonelist .\" patches go in. In the latter case, we'd need to know the .\" order of the current node's zonelist to return the correct .\" node. Need to tighten this up with the syscall results. .BR numa_has_preferred_many () Returns > 0 if the system supports multiple preferred nodes. .BR numa_preferred_many () Returns the current set of preferred nodes. This implies the empty set when the policy isn't one used for preference .I (PREFERRED, PREFERRED_MANY, BIND). The caller is responsible for freeing the mask with .BR numa_bitmask_free (). .BR numa_set_preferred () sets the preferred node for the current task to .IR node . The system will attempt to allocate memory from the preferred node, but will fall back to other nodes if no memory is available on the the preferred node. Passing a .I node of \-1 argument specifies local allocation and is equivalent to calling .BR numa_set_localalloc (). .BR numa_set_preferred_many () sets the preferred set of nodes for the current task to .IR nodemask . This is similar to .BR numa_set_preferred () with the exception that it utilizes a different kernel interface to specify multiple preferred nodes. The caller is responsible for freeing the mask with .BR numa_bitmask_free (). .BR numa_get_interleave_mask () returns the current interleave mask if the task's memory allocation policy is page interleaved. Otherwise, this function returns an empty mask. .BR numa_set_interleave_mask () sets the memory interleave mask for the current task to .IR nodemask . All new memory allocations are page interleaved over all nodes in the interleave mask. Interleaving can be turned off again by passing an empty mask .RI ( numa_no_nodes ). The page interleaving only occurs on the actual page fault that puts a new page into the current address space. 
It is also only a hint: the kernel will fall back to other nodes if no memory is available on the interleave target. .\" NOTE: the following is not really the case. this function sets the .\" task policy for all future allocations, including stack, bss, ... .\" The functions specified in this sentence actually allocate a new memory .\" range [via mmap()]. This is quite a different thing. Suggest we drop .\" this. .\" This is a low level .\" function, it may be more convenient to use the higher level functions like .\" .BR numa_alloc_interleaved () .\" or .\" .BR numa_alloc_interleaved_subset (). .BR numa_interleave_memory () interleaves .I size bytes of memory page by page from .I start on nodes specified in .IR nodemask . The .I size argument will be rounded up to a multiple of the system page size. If .I nodemask contains nodes that are externally denied to this process, this call will fail. This is a lower level function to interleave allocated but not yet faulted in memory. Not yet faulted in means the memory is allocated using .BR mmap (2) or .BR shmat (2), but has not been accessed by the current process yet. The memory is page interleaved to all nodes specified in .IR nodemask . Normally .BR numa_alloc_interleaved () should be used for private memory instead, but this function is useful to handle shared memory areas. To be useful the memory area should be several megabytes at least (or tens of megabytes of hugetlbfs mappings) If the .BR numa_set_strict () flag is true then the operation will cause a numa_error if there were already pages in the mapping that do not follow the policy. .BR numa_bind () binds the current task and its children to the nodes specified in .IR nodemask . They will only run on the CPUs of the specified nodes and only be able to allocate memory from them. This function is equivalent to calling .\" FIXME checkme .\" This is the case. --lts .I numa_run_on_node_mask(nodemask) followed by .IR numa_set_membind(nodemask) . 
If tasks should be bound to individual CPUs inside nodes consider using .I numa_node_to_cpus and the .I sched_setaffinity(2) syscall. .BR numa_set_localalloc () sets the memory allocation policy for the calling task to local allocation. In this mode, the preferred node for memory allocation is effectively the node where the task is executing at the time of a page allocation. .BR numa_set_membind () sets the memory allocation mask. The task will only allocate memory from the nodes set in .IR nodemask . Passing an empty .I nodemask or a .I nodemask that contains nodes other than those in the mask returned by .IR numa_get_mems_allowed () will result in an error. .BR numa_set_membind_balancing () sets the memory allocation mask and enables the Linux kernel NUMA balancing for the task if the feature is supported by the kernel. The task will only allocate memory from the nodes set in .IR nodemask . Passing an empty .I nodemask or a .I nodemask that contains nodes other than those in the mask returned by .IR numa_get_mems_allowed () will result in an error. .BR numa_get_membind () returns the mask of nodes from which memory can currently be allocated. If the returned mask is equal to .IR numa_all_nodes , then memory allocation is allowed from all nodes. .BR numa_alloc_onnode () allocates memory on a specific node. The .I size argument will be rounded up to a multiple of the system page size. If the specified .I node is externally denied to this process, this call will fail. This function is relatively slow compared to the .IR malloc (3) family of functions. The memory must be freed with .BR numa_free (). On errors NULL is returned. .BR numa_alloc_local () allocates .I size bytes of memory on the local node. The .I size argument will be rounded up to a multiple of the system page size. This function is relatively slow compared to the .IR malloc (3) family of functions. The memory must be freed with .BR numa_free (). On errors NULL is returned.
.BR numa_alloc_interleaved () allocates .I size bytes of memory page interleaved on all nodes. This function is relatively slow and should only be used for large areas consisting of multiple pages. The interleaving works at page level and will only show an effect when the area is large. The allocated memory must be freed with .BR numa_free (). On error, NULL is returned. .BR numa_alloc_interleaved_subset () attempts to allocate .I size bytes of memory page interleaved on all nodes. The .I size argument will be rounded up to a multiple of the system page size. The nodes on which a process is allowed to allocate memory may be constrained externally. If this is the case, this function may fail. This function is relatively slow compared to the .IR malloc (3) family of functions and should only be used for large areas consisting of multiple pages. The interleaving works at page level and will only show an effect when the area is large. The allocated memory must be freed with .BR numa_free (). On error, NULL is returned. .BR numa_alloc () allocates .I size bytes of memory with the current NUMA policy. The .I size argument will be rounded up to a multiple of the system page size. This function is relatively slow compared to the .IR malloc (3) family of functions. The memory must be freed with .BR numa_free (). On errors NULL is returned. .BR numa_realloc () changes the size of the memory area pointed to by .I old_addr from .I old_size to .I new_size. The memory area pointed to by .I old_addr must have been allocated with one of the .BR numa_alloc* functions. The .I new_size will be rounded up to a multiple of the system page size. The contents of the memory area will be unchanged to the minimum of the old and new sizes; newly allocated memory will be uninitialized. The memory policy (and node bindings) associated with the original memory area will be preserved in the resized area. 
For example, if the initial area was allocated with a call to .BR numa_alloc_onnode(), then the new pages (if the area is enlarged) will be allocated on the same node. However, if no memory policy was set for the original area, then .BR numa_realloc () cannot guarantee that the new pages will be allocated on the same node. On success, the address of the resized area is returned (which might be different from that of the initial area), otherwise NULL is returned and .I errno is set to indicate the error. The pointer returned by .BR numa_realloc () is suitable for passing to .BR numa_free (). .BR numa_free () frees .I size bytes of memory starting at .IR start , allocated by the .B numa_alloc_* functions above. The .I size argument will be rounded up to a multiple of the system page size. .BR numa_run_on_node () runs the current task and its children on a specific node. They will not migrate to CPUs of other nodes until the node affinity is reset with a new call to .BR numa_run_on_node_mask (). Passing \-1 permits the kernel to schedule on all nodes again. On success, 0 is returned; on error \-1 is returned, and .I errno is set to indicate the error. .BR numa_run_on_node_mask () runs the current task and its children only on nodes specified in .IR nodemask . They will not migrate to CPUs of other nodes until the node affinity is reset with a new call to .BR numa_run_on_node_mask () or .BR numa_run_on_node (). Passing .I numa_all_nodes permits the kernel to schedule on all nodes again. On success, 0 is returned; on error \-1 is returned, and .I errno is set to indicate the error. .BR numa_run_on_node_mask_all () runs the current task and its children only on nodes specified in .IR nodemask like .I numa_run_on_node_mask but without any cpuset awareness. .BR numa_get_run_node_mask () returns a mask of CPUs on which the current task is allowed to run. .BR numa_tonode_memory () put memory on a specific node. 
The constraints described for .BR numa_interleave_memory () apply here too. .BR numa_tonodemask_memory () put memory on a specific set of nodes. The constraints described for .BR numa_interleave_memory () apply here too. .BR numa_setlocal_memory () locates memory on the current node. The constraints described for .BR numa_interleave_memory () apply here too. .BR numa_police_memory () locates memory with the current NUMA policy. The constraints described for .BR numa_interleave_memory () apply here too. .BR numa_distance () reports the distance in the machine topology between two nodes. The factors are a multiple of 10. It returns 0 when the distance cannot be determined. A node has distance 10 to itself. Reporting the distance requires a Linux kernel version of .I 2.6.10 or newer. .BR numa_set_bind_policy () specifies whether calls that bind memory to a specific node should use the preferred policy or a strict policy. The preferred policy allows the kernel to allocate memory on other nodes when there isn't enough free on the target node. Strict will fail the allocation in that case. Setting the argument to 1 specifies strict, 0 preferred. Note that specifying more than one node non strict may only use the first node in some kernel versions. .BR numa_set_strict () sets a flag that says whether the functions allocating on specific nodes should use a strict policy. Strict means the allocation will fail if the memory cannot be allocated on the target node. Default operation is to fall back to other nodes. This doesn't apply to interleave and default. .BR numa_get_interleave_node() is used by .I libnuma internally. It is probably not useful for user applications. It uses the MPOL_F_NODE flag of the get_mempolicy system call, which is not intended for application use (its operation may change or be removed altogether in future kernel versions). See get_mempolicy(2). .BR numa_pagesize() returns the number of bytes in a page.
This function is simply a fast alternative to repeated calls to the getpagesize system call. See getpagesize(2). .BR numa_sched_getaffinity() retrieves a bitmask of the cpus on which a task may run. The task is specified by .I pid. Returns the return value of the sched_getaffinity system call. See sched_getaffinity(2). The bitmask must be at least the size of the kernel's cpu mask structure. Use .BR numa_allocate_cpumask() to allocate it. Test the bits in the mask by calling .BR numa_bitmask_isbitset(). .BR numa_sched_setaffinity() sets a task's allowed cpu's to those cpu's specified in .I mask. The task is specified by .I pid. Returns the return value of the sched_setaffinity system call. See sched_setaffinity(2). You may allocate the bitmask with .BR numa_allocate_cpumask(). Or the bitmask may be smaller than the kernel's cpu mask structure. For example, call .BR numa_bitmask_alloc() using a maximum number of cpus from .BR numa_num_configured_cpus(). Set the bits in the mask by calling .BR numa_bitmask_setbit(). .BR numa_node_to_cpus () converts a node number to a bitmask of CPUs. The user must pass a bitmask structure with a mask buffer long enough to represent all possible cpu's. Use numa_allocate_cpumask() to create it. If the bitmask is not long enough .I errno will be set to .I ERANGE and \-1 returned. On success 0 is returned. .BR numa_node_to_cpu_update () Mark cpus bitmask of all nodes stale, then get the latest bitmask by calling .BR numa_node_to_cpus () This allows to update the libnuma state after a CPU hotplug event. The application is in charge of detecting CPU hotplug events. .BR numa_node_of_cpu () returns the node that a cpu belongs to. If the user supplies an invalid cpu .I errno will be set to .I EINVAL and \-1 will be returned. .BR numa_allocate_cpumask () returns a bitmask of a size equal to the kernel's cpu mask (kernel type cpumask_t). In other words, large enough to represent NR_CPUS cpus. 
This number of cpus can be gotten by calling .BR numa_num_possible_cpus(). The bitmask is zero-filled. .BR numa_free_cpumask frees a cpumask previously allocated by .I numa_allocate_cpumask. .BR numa_allocate_nodemask() returns a bitmask of a size equal to the kernel's node mask (kernel type nodemask_t). In other words, large enough to represent MAX_NUMNODES nodes. This number of nodes can be gotten by calling .BR numa_num_possible_nodes(). The bitmask is zero-filled. .BR numa_free_nodemask() frees a nodemask previously allocated by .I numa_allocate_nodemask(). .BR numa_bitmask_alloc() allocates a bitmask structure and its associated bit mask. The memory allocated for the bit mask contains enough words (type unsigned long) to contain .I n bits. The bit mask is zero-filled. The bitmask structure points to the bit mask and contains the .I n value. .BR numa_bitmask_clearall() sets all bits in the bit mask to 0. The bitmask structure points to the bit mask and contains its size ( .I bmp ->size). The value of .I bmp is always returned. Note that .BR numa_bitmask_alloc() creates a zero-filled bit mask. .BR numa_bitmask_clearbit() sets a specified bit in a bit mask to 0. Nothing is done if the .I n value is greater than the size of the bitmask (and no error is returned). The value of .I bmp is always returned. .BR numa_bitmask_equal() returns 1 if two bitmasks are equal. It returns 0 if they are not equal. If the bitmask structures control bit masks of different sizes, the "missing" trailing bits of the smaller bit mask are considered to be 0. .BR numa_bitmask_free() deallocates the memory of both the bitmask structure pointed to by .I bmp and the bit mask. It is an error to attempt to free this bitmask twice. .BR numa_bitmask_isbitset() returns the value of a specified bit in a bit mask. If the .I n value is greater than the size of the bit map, 0 is returned. .BR numa_bitmask_nbytes() returns the size (in bytes) of the bit mask controlled by .I bmp.
The bit masks are always full words (type unsigned long), and the returned size is the actual size of all those words. .BR numa_bitmask_setall() sets all bits in the bit mask to 1. The bitmask structure points to the bit mask and contains its size ( .I bmp ->size). The value of .I bmp is always returned. .BR numa_bitmask_setbit() sets a specified bit in a bit mask to 1. Nothing is done if .I n is greater than the size of the bitmask (and no error is returned). The value of .I bmp is always returned. .BR copy_bitmask_to_nodemask() copies the body (the bit map itself) of the bitmask structure pointed to by .I bmp to the nodemask_t structure pointed to by the .I nodemask pointer. If the two areas differ in size, the copy is truncated to the size of the receiving field or zero-filled. .BR copy_nodemask_to_bitmask() copies the nodemask_t structure pointed to by the .I nodemask pointer to the body (the bit map itself) of the bitmask structure pointed to by the .I bmp pointer. If the two areas differ in size, the copy is truncated to the size of the receiving field or zero-filled. .BR copy_bitmask_to_bitmask() copies the body (the bit map itself) of the bitmask structure pointed to by the .I bmpfrom pointer to the body of the bitmask structure pointed to by the .I bmpto pointer. If the two areas differ in size, the copy is truncated to the size of the receiving field or zero-filled. .BR numa_bitmask_weight() returns a count of the bits that are set in the body of the bitmask pointed to by the .I bmp argument. .br .BR numa_move_pages() moves a list of pages in the address space of the currently executing or current process. It simply uses the move_pages system call. .br .I pid - ID of task. If not valid, use the current task. .br .I count - Number of pages. .br .I pages - List of pages to move. .br .I nodes - List of nodes to which pages can be moved. .br .I status - Field to which status is to be returned. 
.br .I flags - MPOL_MF_MOVE or MPOL_MF_MOVE_ALL .br See move_pages(2). .BR numa_migrate_pages() simply uses the migrate_pages system call to cause the pages of the calling task, or a specified task, to be migrated from one set of nodes to another. See migrate_pages(2). The bit masks representing the nodes should be allocated with .BR numa_allocate_nodemask() , or with .BR numa_bitmask_alloc() using an .I n value returned from .BR numa_num_possible_nodes(). A task's current node set can be gotten by calling .BR numa_get_membind(). Bits in the .I tonodes mask can be set by calls to .BR numa_bitmask_setbit(). .BR numa_error () is a .I libnuma internal function that can be overridden by the user program. This function is called with a .I char * argument when a .I libnuma function fails. Overriding the library internal definition makes it possible to specify a different error handling strategy when a .I libnuma function fails. It does not affect .BR numa_available (). The .BR numa_error () function defined in .I libnuma prints an error on .I stderr and terminates the program if .I numa_exit_on_error is set to a non-zero value. The default value of .I numa_exit_on_error is zero. .BR numa_warn () is a .I libnuma internal function that can also be overridden by the user program. It is called to warn the user when a .I libnuma function encounters a non-fatal error. The default implementation prints a warning to .IR stderr . The first argument is a unique number identifying each warning. After that there is a .BR printf (3)-style format string and a variable number of arguments. .I numa_warn exits the program when .I numa_exit_on_warn is set to a non-zero value. The default value of .I numa_exit_on_warn is zero. .SH Compatibility with libnuma version 1 Binaries that were compiled for libnuma version 1 need not be re-compiled to run with libnuma version 2. .br Source codes written for libnuma version 1 may be re-compiled without change with version 2 installed.
To do so, in the code's Makefile add this option to CFLAGS: -DNUMA_VERSION1_COMPATIBILITY .SH THREAD SAFETY .I numa_set_bind_policy and .I numa_exit_on_error are process global. The other calls are thread safe. .SH COPYRIGHT Copyright 2002, 2004, 2007, 2008 Andi Kleen, SuSE Labs. .I libnuma is under the GNU Lesser General Public License, v2.1. .SH SEE ALSO .BR get_mempolicy (2), .BR set_mempolicy (2), .BR getpagesize (2), .BR mbind (2), .BR mmap (2), .BR shmat (2), .BR numactl (8), .BR sched_getaffinity (2) .BR sched_setaffinity (2) .BR move_pages (2) .BR migrate_pages (2) 07070100000024000081A40000000000000000000000016319106A000036D3000000000000000000000000000000000000002100000000numactl-* Copyright (C) 2003,2004 Andi Kleen, SuSE Labs. libnuma is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; version 2.1. libnuma is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should find a copy of v2.1 of the GNU Lesser General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _NUMA_H #define _NUMA_H 1 /* allow an application to test for the current programming interface: */ #define LIBNUMA_API_VERSION 2 /* Simple NUMA policy library */ #include <stddef.h> #include <string.h> #include <sys/types.h> #include <stdlib.h> #if defined(__x86_64__) || defined(__i386__) #define NUMA_NUM_NODES 128 #else #define NUMA_NUM_NODES 2048 #endif #ifdef __cplusplus extern "C" { #endif typedef struct { unsigned long n[NUMA_NUM_NODES/(sizeof(unsigned long)*8)]; } nodemask_t; struct bitmask { unsigned long size; /* number of bits in the map */ unsigned long *maskp; }; /* operations on struct bitmask */ int numa_bitmask_isbitset(const struct bitmask *, unsigned int); struct bitmask *numa_bitmask_setall(struct bitmask *); struct bitmask *numa_bitmask_clearall(struct bitmask *); struct bitmask *numa_bitmask_setbit(struct bitmask *, unsigned int); struct bitmask *numa_bitmask_clearbit(struct bitmask *, unsigned int); unsigned int numa_bitmask_nbytes(struct bitmask *); unsigned int numa_bitmask_weight(const struct bitmask *); struct bitmask *numa_bitmask_alloc(unsigned int); void numa_bitmask_free(struct bitmask *); int numa_bitmask_equal(const struct bitmask *, const struct bitmask *); void copy_nodemask_to_bitmask(nodemask_t *, struct bitmask *); void copy_bitmask_to_nodemask(struct bitmask *, nodemask_t *); void copy_bitmask_to_bitmask(struct bitmask *, struct bitmask *); /* compatibility for codes that used them: */ static inline void nodemask_zero(nodemask_t *mask) { struct bitmask tmp; tmp.maskp = (unsigned long *)mask; tmp.size = sizeof(nodemask_t) * 8; numa_bitmask_clearall(&tmp); } static inline void nodemask_zero_compat(nodemask_t *mask) { struct bitmask tmp; tmp.maskp = (unsigned long *)mask; tmp.size = sizeof(nodemask_t) * 8; 
numa_bitmask_clearall(&tmp); } static inline void nodemask_set_compat(nodemask_t *mask, int node) { mask->n[node / (8*sizeof(unsigned long))] |= (1UL<<(node%(8*sizeof(unsigned long)))); } static inline void nodemask_clr_compat(nodemask_t *mask, int node) { mask->n[node / (8*sizeof(unsigned long))] &= ~(1UL<<(node%(8*sizeof(unsigned long)))); } static inline int nodemask_isset_compat(const nodemask_t *mask, int node) { if ((unsigned)node >= NUMA_NUM_NODES) return 0; if (mask->n[node / (8*sizeof(unsigned long))] & (1UL<<(node%(8*sizeof(unsigned long))))) return 1; return 0; } static inline int nodemask_equal(const nodemask_t *a, const nodemask_t *b) { struct bitmask tmp_a, tmp_b; tmp_a.maskp = (unsigned long *)a; tmp_a.size = sizeof(nodemask_t) * 8; tmp_b.maskp = (unsigned long *)b; tmp_b.size = sizeof(nodemask_t) * 8; return numa_bitmask_equal(&tmp_a, &tmp_b); } static inline int nodemask_equal_compat(const nodemask_t *a, const nodemask_t *b) { struct bitmask tmp_a, tmp_b; tmp_a.maskp = (unsigned long *)a; tmp_a.size = sizeof(nodemask_t) * 8; tmp_b.maskp = (unsigned long *)b; tmp_b.size = sizeof(nodemask_t) * 8; return numa_bitmask_equal(&tmp_a, &tmp_b); } /* NUMA support available. If this returns a negative value all other function in this library are undefined. */ int numa_available(void); /* Basic NUMA state */ /* Get max available node */ int numa_max_node(void); int numa_max_possible_node(void); /* Return preferred node */ int numa_preferred(void); /* Return node size and free memory */ long long numa_node_size64(int node, long long *freep); long long numa_node_size(int node, long long *freep); int numa_pagesize(void); /* Set with all nodes from which the calling process may allocate memory. Only valid after numa_available. */ extern struct bitmask *numa_all_nodes_ptr; /* Set with all nodes the kernel has exposed to userspace */ extern struct bitmask *numa_nodes_ptr; /* For source compatibility */ extern nodemask_t numa_all_nodes; /* Set with all cpus. 
*/ extern struct bitmask *numa_all_cpus_ptr; /* Set with no nodes */ extern struct bitmask *numa_no_nodes_ptr; /* Source compatibility */ extern nodemask_t numa_no_nodes; /* Only run and allocate memory from a specific set of nodes. */ void numa_bind(struct bitmask *nodes); /* Set the NUMA node interleaving mask. 0 to turn off interleaving */ void numa_set_interleave_mask(struct bitmask *nodemask); /* Return the current interleaving mask */ struct bitmask *numa_get_interleave_mask(void); /* allocate a bitmask big enough for all nodes */ struct bitmask *numa_allocate_nodemask(void); static inline void numa_free_nodemask(struct bitmask *b) { numa_bitmask_free(b); } /* Some node to preferably allocate memory from for task. */ void numa_set_preferred(int node); /* Returns whether or not the platform supports MPOL_PREFERRED_MANY */ int numa_has_preferred_many(void); /* Set of nodes to preferably allocate memory from for task. */ void numa_set_preferred_many(struct bitmask *bitmask); /* Return preferred nodes */ struct bitmask *numa_preferred_many(void); /* Set local memory allocation policy for task */ void numa_set_localalloc(void); /* Only allocate memory from the nodes set in mask. 0 to turn off */ void numa_set_membind(struct bitmask *nodemask); /* Only allocate memory from the nodes set in mask. Optimize page placement with Linux kernel NUMA balancing if possible. 0 to turn off */ void numa_set_membind_balancing(struct bitmask *bmp); /* Return current membind */ struct bitmask *numa_get_membind(void); /* Return allowed memories [nodes] */ struct bitmask *numa_get_mems_allowed(void); int numa_get_interleave_node(void); /* NUMA memory allocation. These functions always round to page size and are relatively slow. */ /* Alloc memory page interleaved on nodes in mask */ void *numa_alloc_interleaved_subset(size_t size, struct bitmask *nodemask); /* Alloc memory page interleaved on all nodes. 
*/ void *numa_alloc_interleaved(size_t size); /* Alloc memory located on node */ void *numa_alloc_onnode(size_t size, int node); /* Alloc memory on local node */ void *numa_alloc_local(size_t size); /* Allocation with current policy */ void *numa_alloc(size_t size); /* Change the size of a memory area preserving the memory policy */ void *numa_realloc(void *old_addr, size_t old_size, size_t new_size); /* Free memory allocated by the functions above */ void numa_free(void *mem, size_t size); /* Low level functions, primarily for shared memory. All memory processed by these must not be touched yet */ /* Interleave a memory area. */ void numa_interleave_memory(void *mem, size_t size, struct bitmask *mask); /* Allocate a memory area on a specific node. */ void numa_tonode_memory(void *start, size_t size, int node); /* Allocate memory on a mask of nodes. */ void numa_tonodemask_memory(void *mem, size_t size, struct bitmask *mask); /* Allocate a memory area on the current node. */ void numa_setlocal_memory(void *start, size_t size); /* Allocate memory area with current memory policy */ void numa_police_memory(void *start, size_t size); /* Run current task only on nodes in mask */ int numa_run_on_node_mask(struct bitmask *mask); /* Run current task on nodes in mask without any cpuset awareness */ int numa_run_on_node_mask_all(struct bitmask *mask); /* Run current task only on node */ int numa_run_on_node(int node); /* Return current mask of nodes the task can run on */ struct bitmask * numa_get_run_node_mask(void); /* When strict fail allocation when memory cannot be allocated in target node(s). 
*/ void numa_set_bind_policy(int strict); /* Fail when existing memory has incompatible policy */ void numa_set_strict(int flag); /* maximum nodes (size of kernel nodemask_t) */ int numa_num_possible_nodes(void); /* maximum cpus (size of kernel cpumask_t) */ int numa_num_possible_cpus(void); /* nodes in the system */ int numa_num_configured_nodes(void); /* maximum cpus */ int numa_num_configured_cpus(void); /* maximum cpus allowed to current task */ int numa_num_task_cpus(void); int numa_num_thread_cpus(void); /* backward compatibility */ /* maximum nodes allowed to current task */ int numa_num_task_nodes(void); int numa_num_thread_nodes(void); /* backward compatibility */ /* allocate a bitmask the size of the kernel cpumask_t */ struct bitmask *numa_allocate_cpumask(void); static inline void numa_free_cpumask(struct bitmask *b) { numa_bitmask_free(b); } /* Convert node to CPU mask. -1/errno on failure, otherwise 0. */ int numa_node_to_cpus(int, struct bitmask *); void numa_node_to_cpu_update(void); /* report the node of the specified cpu. -1/errno on invalid cpu. */ int numa_node_of_cpu(int cpu); /* Report distance of node1 from node2. 0 on error.*/ int numa_distance(int node1, int node2); /* Error handling. */ /* This is an internal function in libnuma that can be overwritten by an user program. Default is to print an error to stderr and exit if numa_exit_on_error is true. */ void numa_error(char *where); /* When true exit the program when a NUMA system call (except numa_available) fails */ extern int numa_exit_on_error; /* Warning function. Can also be overwritten. Default is to print on stderr once. 
*/ void numa_warn(int num, char *fmt, ...); /* When true exit the program on a numa_warn() call */ extern int numa_exit_on_warn; int numa_migrate_pages(int pid, struct bitmask *from, struct bitmask *to); int numa_move_pages(int pid, unsigned long count, void **pages, const int *nodes, int *status, int flags); int numa_sched_getaffinity(pid_t, struct bitmask *); int numa_sched_setaffinity(pid_t, struct bitmask *); /* Convert an ascii list of nodes to a bitmask */ struct bitmask *numa_parse_nodestring(const char *); /* Convert an ascii list of nodes to a bitmask without current nodeset * dependency */ struct bitmask *numa_parse_nodestring_all(const char *); /* Convert an ascii list of cpu to a bitmask */ struct bitmask *numa_parse_cpustring(const char *); /* Convert an ascii list of cpu to a bitmask without current taskset * dependency */ struct bitmask *numa_parse_cpustring_all(const char *); /* * The following functions are for source code compatibility * with releases prior to version 2. * Such codes should be compiled with NUMA_VERSION1_COMPATIBILITY defined. 
*/ static inline void numa_set_interleave_mask_compat(nodemask_t *nodemask) { struct bitmask tmp; tmp.maskp = (unsigned long *)nodemask; tmp.size = sizeof(nodemask_t) * 8; numa_set_interleave_mask(&tmp); } static inline nodemask_t numa_get_interleave_mask_compat(void) { struct bitmask *tp; nodemask_t mask; tp = numa_get_interleave_mask(); copy_bitmask_to_nodemask(tp, &mask); numa_bitmask_free(tp); return mask; } static inline void numa_bind_compat(nodemask_t *mask) { struct bitmask *tp; tp = numa_allocate_nodemask(); copy_nodemask_to_bitmask(mask, tp); numa_bind(tp); numa_bitmask_free(tp); } static inline void numa_set_membind_compat(nodemask_t *mask) { struct bitmask tmp; tmp.maskp = (unsigned long *)mask; tmp.size = sizeof(nodemask_t) * 8; numa_set_membind(&tmp); } static inline nodemask_t numa_get_membind_compat(void) { struct bitmask *tp; nodemask_t mask; tp = numa_get_membind(); copy_bitmask_to_nodemask(tp, &mask); numa_bitmask_free(tp); return mask; } static inline void *numa_alloc_interleaved_subset_compat(size_t size, const nodemask_t *mask) { struct bitmask tmp; tmp.maskp = (unsigned long *)mask; tmp.size = sizeof(nodemask_t) * 8; return numa_alloc_interleaved_subset(size, &tmp); } static inline int numa_run_on_node_mask_compat(const nodemask_t *mask) { struct bitmask tmp; tmp.maskp = (unsigned long *)mask; tmp.size = sizeof(nodemask_t) * 8; return numa_run_on_node_mask(&tmp); } static inline nodemask_t numa_get_run_node_mask_compat(void) { struct bitmask *tp; nodemask_t mask; tp = numa_get_run_node_mask(); copy_bitmask_to_nodemask(tp, &mask); numa_bitmask_free(tp); return mask; } static inline void numa_interleave_memory_compat(void *mem, size_t size, const nodemask_t *mask) { struct bitmask tmp; tmp.maskp = (unsigned long *)mask; tmp.size = sizeof(nodemask_t) * 8; numa_interleave_memory(mem, size, &tmp); } static inline void numa_tonodemask_memory_compat(void *mem, size_t size, const nodemask_t *mask) { struct bitmask tmp; tmp.maskp = (unsigned long 
*)mask; tmp.size = sizeof(nodemask_t) * 8; numa_tonodemask_memory(mem, size, &tmp); } static inline int numa_sched_getaffinity_compat(pid_t pid, unsigned len, unsigned long *mask) { struct bitmask tmp; tmp.maskp = (unsigned long *)mask; tmp.size = len * 8; return numa_sched_getaffinity(pid, &tmp); } static inline int numa_sched_setaffinity_compat(pid_t pid, unsigned len, unsigned long *mask) { struct bitmask tmp; tmp.maskp = (unsigned long *)mask; tmp.size = len * 8; return numa_sched_setaffinity(pid, &tmp); } static inline int numa_node_to_cpus_compat(int node, unsigned long *buffer, int buffer_len) { struct bitmask tmp; tmp.maskp = (unsigned long *)buffer; tmp.size = buffer_len * 8; return numa_node_to_cpus(node, &tmp); } /* end of version 1 compatibility functions */ /* * To compile an application that uses libnuma version 1: * add -DNUMA_VERSION1_COMPATIBILITY to your Makefile's CFLAGS */ #ifdef NUMA_VERSION1_COMPATIBILITY #include <numacompat1.h> #endif #ifdef __cplusplus } #endif #endif 07070100000025000081A40000000000000000000000016319106A000000D8000000000000000000000000000000000000002500000000numactl- exec_prefix=@exec_prefix@ libdir=@libdir@ includedir=@includedir@ Name: numa Description: NUMA policy library Version: @VERSION@ Cflags: -I${includedir} Libs: -L${libdir} -lnuma Libs.Private: @LIBS@ 07070100000026000081A40000000000000000000000016319106A000004CF000000000000000000000000000000000000002800000000numactl- numa_set_interleave_mask(m) numa_set_interleave_mask_compat(m) #define numa_get_interleave_mask() numa_get_interleave_mask_compat() #define numa_bind(m) numa_bind_compat(m) #define numa_get_membind(m) numa_get_membind_compat(m) #define numa_set_membind(m) numa_set_membind_compat(m) #define numa_alloc_interleaved_subset(s,m) numa_alloc_interleaved_subset_compat(s,m) #define numa_run_on_node_mask(m) numa_run_on_node_mask_compat(m) #define numa_get_run_node_mask() numa_get_run_node_mask_compat() #define numa_interleave_memory(st,si,m) 
numa_interleave_memory_compat(st,si,m) #define numa_tonodemask_memory(st,si,m) numa_tonodemask_memory_compat(st,si,m) #define numa_sched_getaffinity(p,l,m) numa_sched_getaffinity_compat(p,l,m) #define numa_sched_setaffinity(p,l,m) numa_sched_setaffinity_compat(p,l,m) #define numa_node_to_cpus(n,b,bl) numa_node_to_cpus_compat(n,b,bl) #define nodemask_zero(m) nodemask_zero_compat(m) #define nodemask_set(m, n) nodemask_set_compat(m, n) #define nodemask_clr(m, n) nodemask_clr_compat(m, n) #define nodemask_isset(m, n) nodemask_isset_compat(m, n) #define nodemask_equal(a, b) nodemask_equal_compat(a, b) 07070100000027000081A40000000000000000000000016319106A000027ED000000000000000000000000000000000000002400000000numactl-\" t .\" Copyright 2003,2004 Andi Kleen, SuSE Labs. .\" .\" Permission is granted to make and distribute verbatim copies of this .\" manual provided the copyright notice and this permission notice are .\" preserved on all copies. .\" .\" Permission is granted to copy and distribute modified versions of this .\" manual under the conditions for verbatim copying, provided that the .\" entire resulting derived work is distributed under the terms of a .\" permission notice identical to this one. .\" .\" Since the Linux kernel and libraries are constantly changing, this .\" manual page may be incorrect or out-of-date. The author(s) assume no .\" responsibility for errors or omissions, or for damages resulting from .\" the use of the information contained herein. .\" .\" Formatted or processed versions of this manual, if unaccompanied by .\" the source, must acknowledge the copyright and authors of this work. 
.TH NUMACTL 8 "Mar 2004" "SuSE Labs" "Linux Administrator's Manual" .SH NAME numactl \- Control NUMA policy for processes or shared memory .SH SYNOPSIS .B numactl [ .B \-\-all ] [ .B \-\-balancing ] [ .B \-\-interleave nodes ] [ .B \-\-preferred node ] [ .B \-\-preferred-many nodes ] [ .B \-\-membind nodes ] [ .B \-\-cpunodebind nodes ] [ .B \-\-physcpubind cpus ] [ .B \-\-localalloc ] [\-\-] command {arguments ...} .br .B numactl \-\-show .br .B numactl \-\-hardware .br .B numactl [ .B \-\-huge ] [ .B \-\-offset offset ] [ .B \-\-shmmode shmmode ] [ .B \-\-length length ] [ .B \-\-strict ] .br [ .B \-\-shmid id ] .B \-\-shm shmkeyfile | .B \-\-file tmpfsfile .br [ .B \-\-touch ] [ .B \-\-dump ] [ .B \-\-dump-nodes ] memory policy .SH DESCRIPTION .B numactl runs processes with a specific NUMA scheduling or memory placement policy. The policy is set for command and inherited by all of its children. In addition it can set persistent policy for shared memory segments or files. .PP Use -- before command if using command options that could be confused with numactl options. .PP .I nodes may be specified as N,N,N or N-N or N,N-N or N-N,N-N and so forth. Relative .I nodes may be specified as +N,N,N or +N-N or +N,N-N and so forth. The + indicates that the node numbers are relative to the process' set of allowed nodes in its current cpuset. A !N-N notation indicates the inverse of N-N, in other words all nodes except N-N. If used with + notation, specify !+N-N. When .I same is specified the previous nodemask specified on the command line is used. all means all nodes in the current cpuset. .PP Instead of a number a node can also be: .TS tab(|); l l. netdev:DEV|The node connected to network device DEV. file:PATH |The node the block device of PATH. ip:HOST |The node of the network device of HOST block:PATH|The node of block device PATH pci:[seg:]bus:dev[:func]|The node of a PCI device. 
.TE Note that block resolves kernel block device names only; for udev names in /dev use .I file: .TP Policy settings are: .TP .B \-\-all, \-a Unset default cpuset awareness, so the user can use all possible CPUs/nodes for the following policy settings. .TP .B \-\-interleave=nodes, \-i nodes Set a memory interleave policy. Memory will be allocated using round robin on .I nodes. When memory cannot be allocated on the current interleave target, fall back to other nodes. Multiple nodes may be specified on --interleave, --membind and --cpunodebind. .TP .B \-\-membind=nodes, \-m nodes Only allocate memory from nodes. Allocation will fail when there is not enough memory available on these nodes. .I nodes may be specified as noted above. .TP .B \-\-cpunodebind=nodes, \-N nodes Only execute .I command on the CPUs of .I nodes. Note that nodes may consist of multiple CPUs. .I nodes may be specified as noted above. .TP .B \-\-physcpubind=cpus, \-C cpus Only execute .I process on .I cpus. This accepts cpu numbers as shown in the .I processor fields of .I /proc/cpuinfo, or cpu numbers relative to the current cpuset. You may specify "all", which means all cpus in the current cpuset. Physical .I cpus may be specified as N,N,N or N-N or N,N-N or N-N,N-N and so forth. Relative .I cpus may be specified as +N,N,N or +N-N or +N,N-N and so forth. The + indicates that the cpu numbers are relative to the process' set of allowed cpus in its current cpuset. A !N-N notation indicates the inverse of N-N, in other words all cpus except N-N. If used with + notation, specify !+N-N. .TP .B \-\-localalloc, \-l Try to allocate on the current node of the process, but if memory cannot be allocated there fall back to other nodes. .TP .B \-\-preferred=node, \-p node Preferably allocate memory on .I node, but if memory cannot be allocated there fall back to other nodes. This option takes only a single node number. Relative notation may be used.
.TP .B \-\-balancing, \-b Enable Linux kernel NUMA balancing for the process if it is supported by the kernel. This should be used with .I \-\-membind, \-m only; otherwise it is ignored. .TP .B \-\-preferred-many=nodes, \-P nodes Preferably allocate memory on .I nodes, but if memory cannot be allocated there fall back to other nodes. This option takes a mask of preferred nodes where the closest node to local is considered most preferred. Relative notation may be used. .TP .B \-\-show, \-s Show NUMA policy settings of the current process. .TP .B \-\-hardware, \-H Show inventory of available nodes on the system. .TP 0 Numactl can set up policy for a SYSV shared memory segment or a file in shmfs/hugetlbfs. This policy is persistent and will be used by all mappings from that shared memory. The order of options matters here. The specification must at least include either of .I \-\-shm, .I \-\-shmid, .I \-\-file to specify the shared memory segment or file and a memory policy as described above ( .I \-\-interleave, .I \-\-localalloc, .I \-\-preferred, .I \-\-preferred-many, .I \-\-membind ). .TP .B \-\-huge When creating a SYSV shared memory segment use huge pages. Only valid before \-\-shmid or \-\-shm. .TP .B \-\-offset Specify offset into the shared memory segment. Default 0. Valid units are .I m (for MB), .I g (for GB), .I k (for KB), otherwise it specifies bytes. .TP .B \-\-strict Give an error when a page in the policied area in the shared memory segment already was faulted in with a conflicting policy. Default is to silently ignore this. .TP .B \-\-shmmode shmmode Only valid before \-\-shmid or \-\-shm. When creating a shared memory segment set it to numeric mode .I shmmode. .TP .B \-\-length length Apply policy to .I length range in the shared memory segment or make the segment length long. Default is to use the remaining length. Required when a shared memory segment is created; it then specifies the length of the new segment.
Valid units are .I m (for MB), .I g (for GB), .I k (for KB), otherwise it specifies bytes. .TP .B \-\-shmid id Create or use a shared memory segment with numeric ID .I id .TP .B \-\-shm shmkeyfile Create or use a shared memory segment, with the ID generated using .I ftok(3) from shmkeyfile. .TP .B \-\-file tmpfsfile Set policy for a file in tmpfs or hugetlbfs. .TP .B \-\-touch Touch pages to enforce policy early. Default is to not touch them; the policy is applied when an application maps and accesses a page. .TP .B \-\-dump Dump policy in the specified range. .TP .B \-\-dump-nodes Dump all nodes of the specified range (very verbose!) .TP Valid node specifiers .TS tab(:); l l. all:All nodes number:Node number number1{,number2}:Node number1 and Node number2 number1-number2:Nodes from number1 to number2 ! nodes:Invert selection of the following specification. .TE .SH EXAMPLES numactl \-\-physcpubind=+0-4,8-12 myapplic arguments Run myapplic on cpus 0-4 and 8-12 of the current cpuset. numactl \-\-interleave=all bigdatabase arguments Run big database with its memory interleaved on all nodes. numactl \-\-cpunodebind=0 \-\-membind=0,1 process Run process on node 0 with memory allocated on nodes 0 and 1. numactl \-\-cpunodebind=0 \-\-membind=0,1 -- process -l Run process as above, but with an option (-l) that would be confused with a numactl option. numactl \-\-cpunodebind=0 \-\-balancing \-\-membind=0,1 process Run process on node 0 with memory allocated on nodes 0 and 1. Optimize the page placement with Linux kernel NUMA balancing mechanism if possible. numactl \-\-cpunodebind=netdev:eth0 \-\-membind=netdev:eth0 network-server Run network-server on the node of network device eth0 with its memory also in the same node. numactl \-\-preferred=1 numactl \-\-show Set preferred node 1 and show the resulting state. numactl \-\-preferred-many=0x3 numactl \-\-show Set preferred nodes 1 and 2, and show the resulting state.
numactl --interleave=all --shm /tmp/shmkey Interleave all of the sysv shared memory region specified by /tmp/shmkey over all nodes. Place a tmpfs file on 2 nodes: numactl --membind=2 dd if=/dev/zero of=/dev/shm/A bs=1M count=1024 numactl --membind=3 dd if=/dev/zero of=/dev/shm/A seek=1024 bs=1M count=1024 numactl --localalloc /dev/shm/file Reset the policy for the shared memory file .I file to the default localalloc policy. .SH NOTES Requires a NUMA policy aware kernel. Command is not executed using a shell. If you want to use shell metacharacters in the child use sh -c as wrapper. Setting policy for a hugetlbfs file does currently not work because it cannot be extended by truncate. Shared memory segments larger than numactl's address space cannot be completely policied. This could be a problem on 32bit architectures. Changing it piece by piece may work. The old .I --cpubind which accepts node numbers, not cpu numbers, is deprecated and replaced with the new .I --cpunodebind and .I --physcpubind options. .SH FILES .I /proc/cpuinfo for the listing of active CPUs. See .I proc(5) for details. .I /sys/devices/system/node/node*/numastat for NUMA memory hit statistics. .SH COPYRIGHT Copyright 2002,2004 Andi Kleen, SuSE Labs. numactl and the demo programs are under the GNU General Public License, v.2 .SH SEE ALSO .I set_mempolicy(2) , .I get_mempolicy(2) , .I mbind(2) , .I sched_setaffinity(2) , .I sched_getaffinity(2) , .I proc(5) , .I ftok(3) , .I shmat(2) , .I migratepages(8) 07070100000028000081ED0000000000000000000000016319106A00003DFB000000000000000000000000000000000000002400000000numactl-* Copyright (C) 2003,2004,2005 Andi Kleen, SuSE Labs. Command line NUMA policy control. numactl is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2. 
numactl is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should find a copy of v2 of the GNU General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define _GNU_SOURCE #include <getopt.h> #include <errno.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <stdarg.h> #include <ctype.h> #include "numa.h" #include "numaif.h" #include "numaint.h" #include "util.h" #include "shm.h" #define CPUSET 0 #define ALL 1 int exitcode; static struct option opts[] = { {"all", 0, 0, 'a'}, {"interleave", 1, 0, 'i' }, {"preferred", 1, 0, 'p' }, {"preferred-many", 1, 0, 'P' }, {"cpubind", 1, 0, 'c' }, {"cpunodebind", 1, 0, 'N' }, {"physcpubind", 1, 0, 'C' }, {"membind", 1, 0, 'm'}, {"show", 0, 0, 's' }, {"localalloc", 0,0, 'l'}, {"balancing", 0, 0, 'b'}, {"hardware", 0,0,'H' }, {"shm", 1, 0, 'S'}, {"file", 1, 0, 'f'}, {"offset", 1, 0, 'o'}, {"length", 1, 0, 'L'}, {"strict", 0, 0, 't'}, {"shmmode", 1, 0, 'M'}, {"dump", 0, 0, 'd'}, {"dump-nodes", 0, 0, 'D'}, {"shmid", 1, 0, 'I'}, {"huge", 0, 0, 'u'}, {"touch", 0, 0, 'T'}, {"verify", 0, 0, 'V'}, /* undocumented - for debugging */ { 0 } }; static void usage(void) { fprintf(stderr, "usage: numactl [--all | -a] [--balancing | -b] [--interleave= | -i <nodes>]\n" " [--preferred= | -p <node>] [--preferred-many= | -P <nodes>]\n" " [--physcpubind= | -C <cpus>] [--cpunodebind= | -N <nodes>]\n" " [--membind= | -m <nodes>] [--localalloc | -l] command args ...\n" " [--localalloc | -l] command args ...\n" " numactl [--show | -s]\n" " numactl [--hardware | -H]\n" " numactl [--length | -L <length>] [--offset | -o <offset>] [--shmmode | -M <shmmode>]\n" " [--strict | -t]\n" " [--shmid | -I <id>] --shm | -S <shmkeyfile>\n" " [--shmid | 
-I <id>] --file | -f <tmpfsfile>\n" " [--huge | -u] [--touch | -T] \n" " memory policy [--dump | -d] [--dump-nodes | -D]\n" "\n" "memory policy is --interleave | -i, --preferred | -p, --membind | -m, --localalloc | -l\n" "<nodes> is a comma delimited list of node numbers or A-B ranges or all.\n" "Instead of a number a node can also be:\n" " netdev:DEV the node connected to network device DEV\n" " file:PATH the node the block device of path is connected to\n" " ip:HOST the node of the network device host routes through\n" " block:PATH the node of block device path\n" " pci:[seg:]bus:dev[:func] The node of a PCI device\n" "<cpus> is a comma delimited list of cpu numbers or A-B ranges or all\n" "all ranges can be inverted with !\n" "all numbers and ranges can be made cpuset-relative with +\n" "the old --cpubind argument is deprecated.\n" "use --cpunodebind or --physcpubind instead\n" "use --balancing | -b to enable Linux kernel NUMA balancing\n" "for the process if it is supported by kernel\n" "<length> can have g (GB), m (MB) or k (KB) suffixes\n"); exit(1); } static void usage_msg(char *msg, ...) 
{ va_list ap; va_start(ap,msg); fprintf(stderr, "numactl: "); vfprintf(stderr, msg, ap); putchar('\n'); usage(); va_end(ap); } static void show_physcpubind(void) { int ncpus = numa_num_configured_cpus(); for (;;) { struct bitmask *cpubuf; cpubuf = numa_bitmask_alloc(ncpus); if (numa_sched_getaffinity(0, cpubuf) < 0) { if (errno == EINVAL && ncpus < 1024*1024) { ncpus *= 2; continue; } err("sched_get_affinity"); } printmask("physcpubind", cpubuf); break; } } static void show(void) { struct bitmask *membind, *interleave, *cpubind, *preferred; unsigned long cur; int policy; if (numa_available() < 0) { show_physcpubind(); printf("No NUMA support available on this system.\n"); exit(1); } cpubind = numa_get_run_node_mask(); preferred = numa_preferred_many(); interleave = numa_get_interleave_mask(); membind = numa_get_membind(); cur = numa_get_interleave_node(); policy = 0; if (get_mempolicy(&policy, NULL, 0, 0, 0) < 0) perror("get_mempolicy"); printf("policy: %s\n", policy_name(policy)); printf("preferred node: "); switch (policy) { case MPOL_PREFERRED: if (numa_bitmask_weight(preferred)) printf("%d\n", find_first(preferred)); else printf("%d\n", 0); break; case MPOL_DEFAULT: printf("current\n"); break; case MPOL_INTERLEAVE: printf("%ld (interleave next)\n",cur); break; case MPOL_BIND: printf("%d\n", find_first(membind)); break; case MPOL_PREFERRED_MANY: printf("%ld (preferred-many)\n",cur); break; } if (policy == MPOL_INTERLEAVE) { printmask("interleavemask", interleave); printf("interleavenode: %ld\n", cur); } show_physcpubind(); printmask("cpubind", cpubind); // for compatibility printmask("nodebind", cpubind); printmask("membind", membind); printmask("preferred", preferred); } static char *fmt_mem(unsigned long long mem, char *buf) { if (mem == -1L) sprintf(buf, "<not available>"); else sprintf(buf, "%llu MB", mem >> 20); return buf; } static void print_distances(int maxnode) { int i,k; int fst = 0; for (i = 0; i <= maxnode; i++) if 
(numa_bitmask_isbitset(numa_nodes_ptr, i)) { fst = i; break; } if (numa_distance(maxnode,fst) == 0) { printf("No distance information available.\n"); return; } printf("node distances:\n"); printf("node "); for (i = 0; i <= maxnode; i++) if (numa_bitmask_isbitset(numa_nodes_ptr, i)) printf("% 3d ", i); printf("\n"); for (i = 0; i <= maxnode; i++) { if (!numa_bitmask_isbitset(numa_nodes_ptr, i)) continue; printf("% 3d: ", i); for (k = 0; k <= maxnode; k++) if (numa_bitmask_isbitset(numa_nodes_ptr, i) && numa_bitmask_isbitset(numa_nodes_ptr, k)) printf("% 3d ", numa_distance(i,k)); printf("\n"); } } static void print_node_cpus(int node) { int i, err; struct bitmask *cpus; cpus = numa_allocate_cpumask(); err = numa_node_to_cpus(node, cpus); if (err >= 0) { for (i = 0; i < cpus->size; i++) if (numa_bitmask_isbitset(cpus, i)) printf(" %d", i); } putchar('\n'); } static void hardware(void) { int i; int numnodes=0; int prevnode=-1; int skip=0; int maxnode = numa_max_node(); if (numa_available() < 0) { printf("No NUMA available on this system\n"); exit(1); } for (i=0; i<=maxnode; i++) if (numa_bitmask_isbitset(numa_nodes_ptr, i)) numnodes++; printf("available: %d nodes (", numnodes); for (i=0; i<=maxnode; i++) { if (numa_bitmask_isbitset(numa_nodes_ptr, i)) { if (prevnode == -1) { printf("%d", i); prevnode=i; continue; } if (i > prevnode + 1) { if (skip) { printf("%d", prevnode); skip=0; } printf(",%d", i); prevnode=i; continue; } if (i == prevnode + 1) { if (!skip) { printf("-"); skip=1; } prevnode=i; } if ((i == maxnode) && skip) printf("%d", prevnode); } } printf(")\n"); for (i = 0; i <= maxnode; i++) { char buf[64]; long long fr; unsigned long long sz = numa_node_size64(i, &fr); if (!numa_bitmask_isbitset(numa_nodes_ptr, i)) continue; printf("node %d cpus:", i); print_node_cpus(i); printf("node %d size: %s\n", i, fmt_mem(sz, buf)); printf("node %d free: %s\n", i, fmt_mem(fr, buf)); } print_distances(maxnode); } static void checkerror(char *s) { if (errno) { perror(s); 
exit(1); } } static void checknuma(void) { static int numa = -1; if (numa < 0) { if (numa_available() < 0) complain("This system does not support NUMA policy"); } numa = 0; } int set_policy = -1; static inline void setpolicy(int pol) { if (set_policy != -1) usage_msg("Conflicting policies"); set_policy = pol; } static inline void nopolicy(void) { if (set_policy >= 0) usage_msg("specify policy after --shm/--file"); } static int shmattached = 0; static int did_node_cpu_parse = 0; static char *shmoption; static inline void check_cpubind(int flag) { if (flag) usage_msg("cannot do --cpubind on shared memory\n"); } static inline void noshm(char *opt) { if (shmattached) usage_msg("%s must be before shared memory specification", opt); shmoption = opt; } static inline void dontshm(char *opt) { if (shmoption) usage_msg("%s shm option is not allowed before %s", shmoption, opt); } static inline void needshm(char *opt) { if (!shmattached) usage_msg("%s must be after shared memory specification", opt); } static inline void check_all_parse(int flag) { if (did_node_cpu_parse) usage_msg("--all/-a option must be before all cpu/node specifications"); } static void get_short_opts(struct option *o, char *s) { *s++ = '+'; while (o->name) { if (isprint(o->val)) { *s++ = o->val; if (o->has_arg) *s++ = ':'; } o++; } *s = '\0'; } static void check_shmbeyond(char *msg) { if (shmoffset >= shmlen) { fprintf(stderr, "numactl: region offset %#llx beyond its length %#llx at %s\n", shmoffset, shmlen, msg); exit(1); } } static struct bitmask *numactl_parse_nodestring(char *s, int flag) { static char *last; if (s[0] == 's' && !strcmp(s, "same")) { if (!last) usage_msg("same needs previous node specification"); s = last; } else { last = s; } if (flag == ALL) return numa_parse_nodestring_all(s); else return numa_parse_nodestring(s); } int main(int ac, char **av) { int c; long node=-1; char *end; char shortopts[array_len(opts)*2 + 1]; struct bitmask *mask = NULL; int did_cpubind = 0; int did_strict = 
0; int do_shm = 0; int do_dump = 0; int parse_all = 0; int numa_balancing = 0; get_short_opts(opts,shortopts); while ((c = getopt_long(ac, av, shortopts, opts, NULL)) != -1) { switch (c) { case 's': /* --show */ show(); exit(0); case 'H': /* --hardware */ nopolicy(); hardware(); exit(0); case 'b': /* --balancing */ nopolicy(); numa_balancing = 1; break; case 'i': /* --interleave */ checknuma(); if (parse_all) mask = numactl_parse_nodestring(optarg, ALL); else mask = numactl_parse_nodestring(optarg, CPUSET); if (!mask) { printf ("<%s> is invalid\n", optarg); usage(); } errno = 0; did_node_cpu_parse = 1; setpolicy(MPOL_INTERLEAVE); if (shmfd >= 0) numa_interleave_memory(shmptr, shmlen, mask); else numa_set_interleave_mask(mask); checkerror("setting interleave mask"); break; case 'N': /* --cpunodebind */ case 'c': /* --cpubind */ dontshm("-c/--cpubind/--cpunodebind"); checknuma(); if (parse_all) mask = numactl_parse_nodestring(optarg, ALL); else mask = numactl_parse_nodestring(optarg, CPUSET); if (!mask) { printf ("<%s> is invalid\n", optarg); usage(); } errno = 0; check_cpubind(do_shm); did_cpubind = 1; did_node_cpu_parse = 1; numa_run_on_node_mask_all(mask); checkerror("sched_setaffinity"); break; case 'C': /* --physcpubind */ { struct bitmask *cpubuf; dontshm("-C/--physcpubind"); if (parse_all) cpubuf = numa_parse_cpustring_all(optarg); else cpubuf = numa_parse_cpustring(optarg); if (!cpubuf) { printf ("<%s> is invalid\n", optarg); usage(); } errno = 0; check_cpubind(do_shm); did_cpubind = 1; did_node_cpu_parse = 1; numa_sched_setaffinity(0, cpubuf); checkerror("sched_setaffinity"); numa_bitmask_free(cpubuf); break; } case 'm': /* --membind */ checknuma(); setpolicy(MPOL_BIND); if (parse_all) mask = numactl_parse_nodestring(optarg, ALL); else mask = numactl_parse_nodestring(optarg, CPUSET); if (!mask) { printf ("<%s> is invalid\n", optarg); usage(); } errno = 0; did_node_cpu_parse = 1; numa_set_bind_policy(1); if (shmfd >= 0) { numa_tonodemask_memory(shmptr, 
shmlen, mask); } else if (numa_balancing) { numa_set_membind_balancing(mask); } else { numa_set_membind(mask); } numa_set_bind_policy(0); checkerror("setting membind"); break; case 'P': /* --preferred-many */ if (!numa_has_preferred_many()) complain("preferred-many requested without kernel support"); case 'p': /* --preferred */ checknuma(); if (parse_all) mask = numactl_parse_nodestring(optarg, ALL); else mask = numactl_parse_nodestring(optarg, CPUSET); if (!mask) { printf ("<%s> is invalid\n", optarg); usage(); } errno = 0; did_node_cpu_parse = 1; numa_set_bind_policy(0); if (shmfd >= 0) { numa_tonode_memory(shmptr, shmlen, node); } else if (c == 'p') { if (numa_bitmask_weight(mask) != 1) usage(); setpolicy(MPOL_PREFERRED); numa_set_preferred(find_first(mask)); } else { setpolicy(MPOL_PREFERRED_MANY); numa_set_preferred_many(mask); } checkerror("setting preferred node"); break; case 'l': /* --local */ checknuma(); setpolicy(MPOL_LOCAL); errno = 0; if (shmfd >= 0) numa_setlocal_memory(shmptr, shmlen); else numa_set_localalloc(); checkerror("local allocation"); break; case 'S': /* --shm */ check_cpubind(did_cpubind); nopolicy(); attach_sysvshm(optarg, "--shm"); shmattached = 1; break; case 'f': /* --file */ check_cpubind(did_cpubind); nopolicy(); attach_shared(optarg, "--file"); shmattached = 1; break; case 'L': /* --length */ noshm("--length"); shmlen = memsize(optarg); break; case 'M': /* --shmmode */ noshm("--shmmode"); shmmode = strtoul(optarg, &end, 8); if (end == optarg || *end) usage(); break; case 'd': /* --dump */ if (shmfd < 0) complain( "Cannot do --dump without shared memory.\n"); dump_shm(); do_dump = 1; break; case 'D': /* --dump-nodes */ if (shmfd < 0) complain( "Cannot do --dump-nodes without shared memory.\n"); dump_shm_nodes(); do_dump = 1; break; case 't': /* --strict */ did_strict = 1; numa_set_strict(1); break; case 'I': /* --shmid */ shmid = strtoul(optarg, &end, 0); if (end == optarg || *end) usage(); break; case 'u': /* --huge */ 
noshm("--huge"); shmflags |= SHM_HUGETLB; break; case 'o': /* --offset */ noshm("--offset"); shmoffset = memsize(optarg); break; case 'T': /* --touch */ needshm("--touch"); check_shmbeyond("--touch"); numa_police_memory(shmptr, shmlen); break; case 'V': /* --verify */ needshm("--verify"); if (set_policy < 0) complain("Need a policy first to verify"); check_shmbeyond("--verify"); numa_police_memory(shmptr, shmlen); if (!mask) complain("Need a mask to verify"); else verify_shm(set_policy, mask); break; case 'a': /* --all */ check_all_parse(did_node_cpu_parse); parse_all = 1; break; default: usage(); } } numa_bitmask_free(mask); av += optind; ac -= optind; if (shmfd >= 0) { if (*av) usage(); exit(exitcode); } if (did_strict) fprintf(stderr, "numactl: warning. Strict flag for process ignored.\n"); if (do_dump) usage_msg("cannot do --dump|--dump-shm for process"); if (shmoption) usage_msg("shm related option %s for process", shmoption); if (*av == NULL) usage(); execvp(*av, av); complain("execution of `%s': %s\n", av[0], strerror(errno)); return 0; /* not reached */ } 07070100000029000081A40000000000000000000000016319106A000007BE000000000000000000000000000000000000002700000000numactl- numactl Summary: Library for tuning for Non Uniform Memory Access machines Version: 2.0.12 Release: 3%{dist} # libnuma is LGPLv2 and GPLv2 # numactl binaries are GPLv2 only License: GPLv2 URL: Source0: BuildRequires: libtool automake autoconf ExcludeArch: s390 %{arm} %description Simple NUMA policy support. It consists of a numactl program to run other programs with a specific NUMA policy. %package libs Summary: libnuma libraries # There is a tiny bit of GPLv2 code in libnuma.c License: LGPLv2 and GPLv2 %description libs numactl-libs provides libnuma, a library to do allocations with NUMA policy in applications. 
%package devel Summary: Development package for building Applications that use numa Requires: %{name}-libs = %{version}-%{release} License: LGPLv2 and GPLv2 %description devel Provides development headers for numa library calls %prep %setup -q -n %{name}-%{version} %build %configure --prefix=/usr --libdir=%{_libdir} # Using recipe to fix rpaths, from here: # sed -i -e 's|^hardcode_libdir_flag_spec=.*|hardcode_libdir_flag_spec=""|g' \ -e 's|^runpath_var=LD_RUN_PATH|runpath_var=DIE_RPATH_DIE|g' libtool make clean make CFLAGS="$RPM_OPT_FLAGS -I." %install rm -rf $RPM_BUILD_ROOT make DESTDIR=$RPM_BUILD_ROOT install %ldconfig_scriptlets %ldconfig_scriptlets libs %files %doc %{_bindir}/numactl %{_bindir}/numademo %{_bindir}/numastat %{_bindir}/memhog %{_bindir}/migspeed %{_bindir}/migratepages %{_mandir}/man8/*.8* %exclude %{_mandir}/man2/*.2* %files libs %{_libdir}/ %{_libdir}/ %files devel %{_libdir}/ %exclude %{_libdir}/libnuma.a %exclude %{_libdir}/ %{_libdir}/pkgconfig/numa.pc %{_includedir}/numa.h %{_includedir}/numaif.h %{_includedir}/numacompat1.h %{_mandir}/man3/*.3* 0707010000002A000081A40000000000000000000000016319106A0000360C000000000000000000000000000000000000002500000000numactl-* Copyright (C) 2003,2004 Andi Kleen, SuSE Labs. Test/demo program for libnuma. This is also a more or less useful benchmark of the NUMA characteristics of your machine. It benchmarks most possible NUMA policy memory configurations with various benchmarks. Compile standalone with cc -O2 numademo.c -o numademo -lnuma -lm numactl is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2. numactl is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should find a copy of v2 of the GNU General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define _GNU_SOURCE 1 #include <stdio.h> #include <string.h> #include <stdlib.h> #include <ctype.h> #include <sys/time.h> #include "numa.h" #include "util.h" #ifdef HAVE_STREAM_LIB #include "stream_lib.h" #endif #ifdef HAVE_MT #include "mt.h" #endif #ifdef HAVE_CLEAR_CACHE #include "clearcache.h" #else static inline void clearcache(void *a, unsigned size) {} #endif #define FRACT_NODES 8 #define FRACT_MASKS 32 static int fract_nodes; static int *node_to_use; static unsigned long msize; /* Should get this from cpuinfo, but on !x86 it's not there */ enum { CACHELINESIZE = 64, }; enum test { MEMSET = 0, MEMCPY, FORWARD, BACKWARD, STREAM, RANDOM2, PTRCHASE, } thistest; static char *delim = " "; static int regression_testing=0; static char *testname[] = { "memset", "memcpy", "forward", "backward", #ifdef HAVE_STREAM_LIB "stream", #endif #ifdef HAVE_MT "random2", #endif "ptrchase", NULL, }; static void output(char *title, char *result) { if (!isspace(delim[0])) printf("%s%s%s\n", title,delim, result); else printf("%-42s%s\n", title, result); } #ifdef HAVE_STREAM_LIB static void do_stream(char *name, unsigned char *mem) { int i; char title[100], buf[100]; double res[STREAM_NRESULTS]; stream_verbose = 0; clearcache(mem, msize); stream_init(mem); stream_test(res); sprintf(title, "%s%s%s", name, delim, "STREAM"); buf[0] = '\0'; for (i = 0; i < STREAM_NRESULTS; i++) { if (buf[0]) strcat(buf,delim); sprintf(buf+strlen(buf), "%s%s%.2f%sMB/s", stream_names[i], delim, res[i], delim); } output(title, buf); clearcache(mem, msize); } #endif /* Set up a randomly distributed list to fool prefetchers */ union node { union node *next; struct { unsigned nexti; unsigned val; }; }; static int cmp_node(const void *ap, const void *bp) { union node *a = (union node *)ap; union node *b = 
(union node *)bp; return a->val - b->val; } static void **ptrchase_init(unsigned char *mem) { long i; union node *nodes = (union node *)mem; long nmemb = msize / sizeof(union node); srand(1234); for (i = 0; i < nmemb; i++) { nodes[i].val = rand(); nodes[i].nexti = i + 1; } qsort(nodes, nmemb, sizeof(union node), cmp_node); for (i = 0; i < nmemb; i++) { union node *n = &nodes[i]; n->next = n->nexti >= nmemb ? NULL : &nodes[n->nexti]; } return (void **)nodes; } static inline unsigned long long timerfold(struct timeval *tv) { return tv->tv_sec * 1000000ULL + tv->tv_usec; } #define LOOPS 10 static void memtest(char *name, unsigned char *mem) { long k; struct timeval start, end, res; unsigned long long max, min, sum, r; int i; char title[128], result[128]; if (!mem) { fprintf(stderr, "Failed to allocate %lu bytes of memory. Test \"%s\" exits.\n", msize, name); return; } #ifdef HAVE_STREAM_LIB if (thistest == STREAM) { do_stream(name, mem); goto out; } #endif max = 0; min = ~0UL; sum = 0; /* * Note: 0th pass allocates the pages, don't measure */ for (i = 0; i < LOOPS+1; i++) { clearcache(mem, msize); switch (thistest) { case PTRCHASE: { void **ptr; ptr = ptrchase_init(mem); gettimeofday(&start,NULL); while (*ptr) ptr = (void **)*ptr; gettimeofday(&end,NULL); /* Side effect to trick the optimizer */ *ptr = "bla"; break; } case MEMSET: gettimeofday(&start,NULL); memset(mem, 0xff, msize); gettimeofday(&end,NULL); break; case MEMCPY: gettimeofday(&start,NULL); memcpy(mem, mem + msize/2, msize/2); gettimeofday(&end,NULL); break; case FORWARD: /* simple kernel to just fetch cachelines and write them back. 
will trigger hardware prefetch */ gettimeofday(&start,NULL); for (k = 0; k < msize; k+=CACHELINESIZE) mem[k]++; gettimeofday(&end,NULL); break; case BACKWARD: gettimeofday(&start,NULL); for (k = msize-5; k > 0; k-=CACHELINESIZE) mem[k]--; gettimeofday(&end,NULL); break; #ifdef HAVE_MT case RANDOM2: { unsigned * __restrict m = (unsigned *)mem; unsigned max = msize / sizeof(unsigned); unsigned mask; mt_init(); mask = 1; while (mask < max) mask = (mask << 1) | 1; /* * There's no guarantee all memory is touched, but * we assume (hope) that the distribution of the MT * is good enough to touch most. */ gettimeofday(&start,NULL); for (k = 0; k < max; k++) { unsigned idx = mt_random() & mask; if (idx >= max) idx -= max; m[idx]++; } gettimeofday(&end,NULL); } #endif default: break; } if (!i) continue; /* don't count allocation pass */ timersub(&end, &start, &res); r = timerfold(&res); if (r > max) max = r; if (r < min) min = r; sum += r; } sprintf(title, "%s%s%s", name, delim, testname[thistest]); #define H(t) (((double)msize) / ((double)t)) #define D3 delim,delim,delim sprintf(result, "Avg%s%.2f%sMB/s%sMax%s%.2f%sMB/s%sMin%s%.2f%sMB/s", delim, H(sum/LOOPS), D3, H(min), D3, H(max), delim); #undef H #undef D3 output(title,result); #ifdef HAVE_STREAM_LIB out: #endif /* Just to make sure that when we switch CPUs that the old guy doesn't still keep it around. 
*/ clearcache(mem, msize); numa_free(mem, msize); } static int popcnt(unsigned long val) { int i = 0, cnt = 0; while (val >> i) { if ((1UL << i) & val) cnt++; i++; } return cnt; } static int numnodes; static int get_node_list(void) { int a, got_nodes = 0; long long free_node_sizes; int max_node; numnodes = numa_num_configured_nodes(); node_to_use = (int *)malloc(numnodes * sizeof(int)); max_node = numa_max_node(); for (a = 0; a <= max_node; a++) { if (numa_node_size(a, &free_node_sizes) > 0) node_to_use[got_nodes++] = a; } if(got_nodes != numnodes) return -1; return got_nodes; } static void test(enum test type) { unsigned long mask; int i, k; char buf[512]; struct bitmask *nodes; nodes = numa_allocate_nodemask(); thistest = type; if (regression_testing) { printf("\nTest %s doing 1 of %d nodes and 1 of %d masks.\n", testname[thistest], fract_nodes, FRACT_MASKS); } memtest("memory with no policy", numa_alloc(msize)); memtest("local memory", numa_alloc_local(msize)); memtest("memory interleaved on all nodes", numa_alloc_interleaved(msize)); for (i = 0; i < numnodes; i++) { if (regression_testing && (i % fract_nodes)) { /* for regression testing (-t) do only every eighth node */ continue; } sprintf(buf, "memory on node %d", node_to_use[i]); memtest(buf, numa_alloc_onnode(msize, node_to_use[i])); } for (mask = 1, i = 0; mask < (1UL<<numnodes); mask++, i++) { int w; char buf2[20]; if (popcnt(mask) == 1) continue; if (regression_testing && (i > 50)) { break; } if (regression_testing && (i % FRACT_MASKS)) { /* for regression testing (-t) do only every 32nd mask permutation */ continue; } numa_bitmask_clearall(nodes); for (w = 0; mask >> w; w++) { if ((mask >> w) & 1) numa_bitmask_setbit(nodes, w); } sprintf(buf, "memory interleaved on"); for (k = 0; k < numnodes; k++) if ((1UL<<node_to_use[k]) & mask) { sprintf(buf2, " %d", node_to_use[k]); strcat(buf, buf2); } memtest(buf, numa_alloc_interleaved_subset(msize, nodes)); if (!numa_has_preferred_many()) continue; sprintf(buf, 
"memory preferred on"); for (k = 0; k < numnodes; k++) if ((1UL<<node_to_use[k]) & mask) { sprintf(buf2, " %d", node_to_use[k]); strcat(buf, buf2); } numa_set_preferred_many(nodes); memtest(buf, numa_alloc(msize)); } for (i = 0; i < numnodes; i++) { if (regression_testing && (node_to_use[i] % fract_nodes)) { /* for regression testing (-t) do only every eighth node */ continue; } printf("setting preferred node to %d\n", node_to_use[i]); numa_set_preferred(node_to_use[i]); memtest("memory with preferred policy", numa_alloc(msize)); } numa_set_interleave_mask(numa_all_nodes_ptr); memtest("manual interleaving to all nodes", numa_alloc(msize)); if (numnodes > 0) { numa_bitmask_clearall(nodes); numa_bitmask_setbit(nodes, node_to_use[0]); numa_bitmask_setbit(nodes, node_to_use[1]); numa_set_interleave_mask(nodes); memtest("manual interleaving on first two nodes", numa_alloc(msize)); printf("current interleave node %d\n", numa_get_interleave_node()); } numa_bitmask_free(nodes); numa_set_interleave_mask(numa_no_nodes_ptr); nodes = numa_allocate_nodemask(); for (i = 0; i < numnodes; i++) { int oldhn = numa_preferred(); if (regression_testing && (i % fract_nodes)) { /* for regression testing (-t) do only every eighth node */ continue; } numa_run_on_node(node_to_use[i]); printf("running on node %d, preferred node %d\n",node_to_use[i], oldhn); memtest("local memory", numa_alloc_local(msize)); memtest("memory interleaved on all nodes", numa_alloc_interleaved(msize)); if (numnodes >= 2) { numa_bitmask_clearall(nodes); numa_bitmask_setbit(nodes, node_to_use[0]); numa_bitmask_setbit(nodes, node_to_use[1]); memtest("memory interleaved on first two nodes", numa_alloc_interleaved_subset(msize, nodes)); } for (k = 0; k < numnodes; k++) { if (node_to_use[k] == node_to_use[i]) continue; if (regression_testing && (node_to_use[k] % fract_nodes)) { /* for regression testing (-t) do only every eighth node */ continue; } sprintf(buf, "alloc on node %d", node_to_use[k]); 
numa_bitmask_clearall(nodes); numa_bitmask_setbit(nodes, node_to_use[k]); numa_set_membind(nodes); memtest(buf, numa_alloc(msize)); numa_set_membind(numa_all_nodes_ptr); } numa_set_localalloc(); memtest("local allocation", numa_alloc(msize)); #define set_pref_many(__i) do { \ numa_bitmask_clearall(nodes); \ numa_bitmask_setbit(nodes, __i); \ numa_set_preferred_many(nodes); \ } while (0) numa_set_preferred(node_to_use[(i + 1) % numnodes]); memtest("setting wrong preferred node", numa_alloc(msize)); numa_set_preferred(node_to_use[i]); memtest("setting correct preferred node", numa_alloc(msize)); if (numa_has_preferred_many()) { set_pref_many(node_to_use[(i + 1) % numnodes]); memtest("setting wrong preferred-many nodes", numa_alloc(msize)); set_pref_many(node_to_use[i]); memtest("setting correct preferred-many nodes", numa_alloc(msize)); } #undef set_pref_many numa_set_localalloc(); if (!delim[0]) printf("\n\n\n"); } numa_bitmask_free(nodes); /* numa_run_on_node_mask is not tested */ } static void usage(void) { int i; printf("usage: numademo [-S] [-f] [-c] [-e] [-t] msize[kmg] {tests}\nNo tests means run all.\n"); printf("-c output CSV data. -f run even without NUMA API. -S run stupid tests. 
-e exit on error\n"); printf("-t regression test; do not run all node combinations\n"); printf("valid tests:"); for (i = 0; testname[i]; i++) printf(" %s", testname[i]); putchar('\n'); exit(1); } int main(int ac, char **av) { int simple_tests = 0; int nr_nodes; int force = 0; while (av[1] && av[1][0] == '-') { ac--; switch (av[1][1]) { case 'c': delim = ","; break; case 'f': force = 1; break; case 'S': simple_tests = 1; break; case 'e': numa_exit_on_error = 1; numa_exit_on_warn = 1; break; case 't': regression_testing = 1; break; default: usage(); break; } ++av; } if (!av[1]) usage(); if (numa_available() < 0) { printf("your system does not support the numa API.\n"); if (!force) exit(1); } nr_nodes = get_node_list(); if(nr_nodes == -1){ fprintf(stderr, "Configured Nodes does not match available memory nodes\n"); exit(1); } if (nr_nodes < 2) { printf("A minimum of 2 nodes is required for this test.\n"); exit(77); } printf("%d nodes available\n", numnodes); fract_nodes = (((numnodes-1)/8)*2) + FRACT_NODES; if (numnodes <= 3) regression_testing = 0; /* set -t auto-off for small systems */ msize = memsize(av[1]); if (!msize) usage(); #ifdef HAVE_STREAM_LIB stream_setmem(msize); #endif if (av[2] == NULL) { test(MEMSET); test(MEMCPY); if (simple_tests) { test(FORWARD); test(BACKWARD); } #ifdef HAVE_MT test(RANDOM2); #endif #ifdef HAVE_STREAM_LIB test(STREAM); #endif if (msize >= sizeof(union node)) { test(PTRCHASE); } else { fprintf(stderr, "You must set msize at least %lu bytes for ptrchase test.\n", sizeof(union node)); exit(1); } } else { int k; for (k = 2; k < ac; k++) { int i; int found = 0; for (i = 0; testname[i]; i++) { if (!strcmp(testname[i],av[k])) { test(i); found = 1; break; } } if (!found) { fprintf(stderr,"unknown test `%s'\n", av[k]); usage(); } } } free(node_to_use); return 0; } 0707010000002B000081A40000000000000000000000016319106A0000069A000000000000000000000000000000000000002300000000numactl- NUMAIF_H #define NUMAIF_H 1 #ifdef __cplusplus extern "C" { 
#endif /* Kernel interface for NUMA API */ /* System calls */ extern long get_mempolicy(int *mode, unsigned long *nmask, unsigned long maxnode, void *addr, unsigned flags); extern long mbind(void *start, unsigned long len, int mode, const unsigned long *nmask, unsigned long maxnode, unsigned flags); extern long set_mempolicy(int mode, const unsigned long *nmask, unsigned long maxnode); extern long migrate_pages(int pid, unsigned long maxnode, const unsigned long *frommask, const unsigned long *tomask); extern long move_pages(int pid, unsigned long count, void **pages, const int *nodes, int *status, int flags); /* Policies */ #define MPOL_DEFAULT 0 #define MPOL_PREFERRED 1 #define MPOL_BIND 2 #define MPOL_INTERLEAVE 3 #define MPOL_LOCAL 4 #define MPOL_PREFERRED_MANY 5 #define MPOL_MAX 6 /* Flags for set_mempolicy, specified in mode */ #define MPOL_F_NUMA_BALANCING (1 << 13) /* Optimize with NUMA balancing if possible */ /* Flags for get_mem_policy */ #define MPOL_F_NODE (1<<0) /* return next il node or node of address */ /* Warning: MPOL_F_NODE is unsupported and subject to change. Don't use. 
*/ #define MPOL_F_ADDR (1<<1) /* look up vma using address */ #define MPOL_F_MEMS_ALLOWED (1<<2) /* query nodes allowed in cpuset */ /* Flags for mbind */ #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ #ifdef __cplusplus } #endif #endif 0707010000002C000081A40000000000000000000000016319106A00000693000000000000000000000000000000000000002400000000numactl-* Internal interfaces of libnuma */ extern int numa_sched_setaffinity_v1(pid_t pid, unsigned len, const unsigned long *mask); extern int numa_sched_getaffinity_v1(pid_t pid, unsigned len, const unsigned long *mask); extern int numa_sched_setaffinity_v1_int(pid_t pid, unsigned len,const unsigned long *mask); extern int numa_sched_getaffinity_v1_int(pid_t pid, unsigned len,const unsigned long *mask); extern int numa_sched_setaffinity_v2(pid_t pid, struct bitmask *mask); extern int numa_sched_getaffinity_v2(pid_t pid, struct bitmask *mask); extern int numa_sched_setaffinity_v2_int(pid_t pid, struct bitmask *mask); extern int numa_sched_getaffinity_v2_int(pid_t pid, struct bitmask *mask); #define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ #define BITS_PER_LONG (sizeof(unsigned long) * 8) #define CPU_BYTES(x) (round_up(x, BITS_PER_LONG)/8) #define CPU_LONGS(x) (CPU_BYTES(x) / sizeof(long)) #define make_internal_alias(x) extern __typeof (x) x##_int __attribute((alias(#x), visibility("hidden"))) #define hidden __attribute__((visibility("hidden"))) enum numa_warn { W_nosysfs, W_noproc, W_badmeminfo, W_nosysfs2, W_cpumap, W_numcpus, W_noderunmask, W_distance, W_memory, W_cpuparse, W_nodeparse, W_blockdev1, W_blockdev2, W_blockdev3, W_blockdev4, W_blockdev5, W_netlink1, W_netlink2, W_netlink3, W_net1, W_net2, W_class1, W_class2, W_pci1, W_pci2, W_node_parse1, W_node_parse2, W_nonode, W_badchar, }; #define howmany(x,y) 
(((x)+((y)-1))/(y)) #define bitsperlong (8 * sizeof(unsigned long)) #define bitsperint (8 * sizeof(unsigned int)) #define longsperbits(n) howmany(n, bitsperlong) #define bytesperbits(x) ((x+7)/8) 0707010000002D000081A40000000000000000000000016319106A00001BE7000000000000000000000000000000000000002400000000numactl-* Copyright (C) 2003,2004 Andi Kleen, SuSE Labs. numamon is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2. numamon is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should find a copy of v2 of the GNU General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Display some numa statistics collected by the CPU. Opteron specific. Also not reliable because the counters are not quite correct in hardware. 
*/ #define _LARGE_FILE_SOURCE 1 #define _GNU_SOURCE 1 #include <string.h> #include <errno.h> #include <stdio.h> #include <unistd.h> #include <dirent.h> #include <getopt.h> #include <stdarg.h> #include <stdlib.h> #include <sys/fcntl.h> enum { LOCALLOCAL = 0, LOCALREMOTE = 1, REMOTELOCAL = 2 }; static int mem[] = { [LOCALLOCAL] = 0xa8, [LOCALREMOTE] = 0x98, [REMOTELOCAL] = 0x68 }; static int io[] = { [LOCALLOCAL] = 0xa4, [LOCALREMOTE] = 0x94, [REMOTELOCAL] = 0x64 }; static int *masks = mem; #define err(x) perror(x),exit(1) #define PERFEVTSEL0 0xc0010000 #define PERFEVTSEL1 0xc0010001 #define PERFEVTSEL2 0xc0010002 #define PERFEVTSEL3 0xc0010003 #define PERFCTR0 0xc0010004 #define PERFCTR1 0xc0010005 #define PERFCTR2 0xc0010006 #define PERFCTR3 0xc0010007 #define EVENT 0xe9 #define PERFEVTSEL_EN (1 << 22) #define PERFEVTSEL_OS (1 << 17) #define PERFEVTSEL_USR (1 << 16) #define BASE (EVENT | PERFEVTSEL_EN | PERFEVTSEL_OS | PERFEVTSEL_USR) #define MAXCPU 8 int force = 0; int msrfd[MAXCPU]; int delay; int absolute; char *cfilter; int verbose; static void usage(void); static void Vprintf(char *fmt, ...) 
{ va_list ap; va_start(ap,fmt); if (verbose) vfprintf(stderr,fmt,ap); va_end(ap); } static unsigned long long rdmsr(int cpu, unsigned long msr) { unsigned long long val; if (pread(msrfd[cpu], &val, 8, msr) != 8) { fprintf(stderr, "rdmsr of %lx failed: %s\n", msr, strerror(errno)); exit(1); } return val; } static void wrmsr(int cpu, unsigned long msr, unsigned long long value) { if (pwrite(msrfd[cpu], &value, 8, msr) != 8) { fprintf(stderr, "wdmsr of %lx failed: %s\n", msr, strerror(errno)); exit(1); } } static int cpufilter(int cpu) { long num; char *end; char *s; if (!cfilter) return 1; for (s = cfilter;;) { num = strtoul(s, &end, 0); if (end == s) usage(); if (cpu == num) return 1; if (*end == ',') s = end+1; else if (*end == 0) break; else usage(); } return 0; } static void checkcounter(int cpu, int clear) { int i; for (i = 1; i < 4; i++) { int clear_this = clear; unsigned long long evtsel = rdmsr(cpu, PERFEVTSEL0 + i); Vprintf("%d: %x %Lx\n", cpu, PERFEVTSEL0 + i, evtsel); if (!(evtsel & PERFEVTSEL_EN)) { Vprintf("reinit %d\n", cpu); wrmsr(cpu, PERFEVTSEL0 + i, BASE | masks[i - 1]); clear_this = 1; } else if (evtsel == (BASE | (masks[i-1] << 8))) { /* everything fine */ } else if (force) { Vprintf("reinit force %d\n", cpu); wrmsr(cpu, PERFEVTSEL0 + i, BASE | (masks[i - 1] << 8)); clear_this = 1; } else { fprintf(stderr, "perfctr %d cpu %d already used with %Lx\n", i, cpu, evtsel); fprintf(stderr, "Consider using -f if you know what you're doing.\n"); exit(1); } if (clear_this) { Vprintf("clearing %d\n", cpu); wrmsr(cpu, PERFCTR0 + i, 0); } } } static void setup(int clear) { DIR *dir; struct dirent *d; int numcpus = 0; memset(msrfd, -1, sizeof(msrfd)); dir = opendir("/dev/cpu"); if (!dir) err("cannot open /dev/cpu"); while ((d = readdir(dir)) != NULL) { char buf[64]; char *end; long cpunum = strtoul(d->d_name, &end, 0); if (*end != 0) continue; if (cpunum > MAXCPU) { fprintf(stderr, "too many cpus %ld %s\n", cpunum, d->d_name); continue; } if 
(!cpufilter(cpunum)) continue; snprintf(buf, 63, "/dev/cpu/%ld/msr", cpunum); msrfd[cpunum] = open64(buf, O_RDWR); if (msrfd[cpunum] < 0) continue; numcpus++; checkcounter(cpunum, clear); } closedir(dir); if (numcpus == 0) { fprintf(stderr, "No CPU found using MSR driver.\n"); exit(1); } } static void printf_padded(int pad, char *fmt, ...) { char buf[pad + 1]; va_list ap; va_start(ap, fmt); vsnprintf(buf, pad, fmt, ap); printf("%-*s", pad, buf); va_end(ap); } static void print_header(void) { printf_padded(4, "CPU "); printf_padded(16, "LOCAL"); printf_padded(16, "LOCAL->REMOTE"); printf_padded(16, "REMOTE->LOCAL"); putchar('\n'); } static void print_cpu(int cpu) { int i; static unsigned long long lastval[4]; printf_padded(4, "%d", cpu); for (i = 1; i < 4; i++) { unsigned long long val = rdmsr(cpu, PERFCTR0 + i); if (absolute) printf_padded(16, "%Lu", val); else printf_padded(16, "%Lu", val - lastval[i]); lastval[i] = val; } putchar('\n'); } static void dumpall(void) { int cnt = 0; int cpu; print_header(); for (;;) { for (cpu = 0; cpu < MAXCPU; ++cpu) { if (msrfd[cpu] < 0) continue; print_cpu(cpu); } if (!delay) break; sleep(delay); if (++cnt > 40) { cnt = 0; print_header(); } } } static void checkk8(void) { char *line = NULL; size_t size = 0; int bad = 0; FILE *f = fopen("/proc/cpuinfo", "r"); if (!f) return; while (getline(&line, &size, f) > 0) { if (!strncmp("vendor_id", line, 9)) { if (!strstr(line, "AMD")) bad++; } if (!strncmp("cpu family", line, 10)) { char *s = line + strcspn(line,":"); int family; if (*s == ':') ++s; family = strtoul(s, NULL, 0); if (family != 15) bad++; } } if (bad) { printf("not an opteron cpu\n"); exit(1); } free(line); fclose(f); } static void usage(void) { fprintf(stderr, "usage: numamon [args] [delay]\n"); fprintf(stderr, " -f forcibly overwrite counters\n"); fprintf(stderr, " -i count IO (default memory)\n"); fprintf(stderr, " -a print absolute counter values (with delay)\n"); fprintf(stderr, " -s setup counters and exit\n"); 
fprintf(stderr, " -c clear counters and exit\n"); fprintf(stderr, " -m Print memory traffic (default)\n"); fprintf(stderr, " -C cpu{,cpu} only print for cpus\n"); fprintf(stderr, " -v Be verbose\n"); exit(1); } int main(int ac, char **av) { int opt; checkk8(); while ((opt = getopt(ac,av,"ifscmaC:v")) != -1) { switch (opt) { case 'f': force = 1; break; case 'c': setup(1); exit(0); case 's': setup(0); exit(0); case 'm': masks = mem; break; case 'i': masks = io; break; case 'a': absolute = 1; break; case 'C': cfilter = optarg; break; case 'v': verbose = 1; break; default: usage(); } } if (av[optind]) { char *end; delay = strtoul(av[optind], &end, 10); if (*end) usage(); if (av[optind+1]) usage(); } setup(0); dumpall(); return 0; } 0707010000002E000081A40000000000000000000000016319106A0000194A000000000000000000000000000000000000002500000000numactl- "numastat" "8" "1.0.0" "Bill Gray" "Administration" .SH NAME .LP \fBnumastat\fP \- Show per-NUMA-node memory statistics for processes and the operating system .SH "SYNTAX" .LP \fBnumastat\fP .br .LP \fBnumastat\fP [\fI\-V\fP] .br .LP \fBnumastat\fP [\fI\<PID>|<pattern>...\fP] .br .LP \fBnumastat\fP [\fI\-c\fP] [\fI\-m\fP] [\fI\-n\fP] [\fI\-p <PID>|<pattern>\fP] [\fI\-s[<node>]\fP] [\fI\-v\fP] [\fI\-z\fP] [\fI\<PID>|<pattern>...\fP] .br .SH "DESCRIPTION" .LP .B numastat with no command options or arguments at all, displays per-node NUMA hit and miss system statistics from the kernel memory allocator. This default \fBnumastat\fP behavior is strictly compatible with the previous long-standing \fBnumastat\fP perl script, written by Andi Kleen. The default \fBnumastat\fP statistics shows per-node numbers (in units of pages of memory) in these categories: .LP .B numa_hit is memory successfully allocated on this node as intended. .LP .B numa_miss is memory allocated on this node despite the process preferring some different node. Each .I numa_miss has a .I numa_foreign on another node. 
.LP .B numa_foreign is memory intended for this node, but actually allocated on some different node. Each .I numa_foreign has a .I numa_miss on another node. .LP .B interleave_hit is interleaved memory successfully allocated on this node as intended. .LP .B local_node is memory allocated on this node while a process was running on it. .LP .B other_node is memory allocated on this node while a process was running on some other node. .LP Any supplied options or arguments with the \fBnumastat\fP command will significantly change both the content and the format of the display. Specified options will cause display units to change to megabytes of memory, and will change other specific behaviors of \fBnumastat\fP as described below. .LP Memory usage information reflects the resident pages on the system. .SH "OPTIONS" .LP .TP \fB\-c\fR Minimize table display width by dynamically shrinking column widths based on data contents. With this option, amounts of memory will be rounded to the nearest megabyte (rather than the usual display with two decimal places). Column width and inter-column spacing will be somewhat unpredictable with this option, but the more dense display will be very useful on systems with many NUMA nodes. .TP \fB\-m\fR Show the meminfo-like system-wide memory usage information. This option produces a per-node breakdown of memory usage information similar to that found in /proc/meminfo. .TP \fB\-n\fR Show the original \fBnumastat\fP statistics info. This will show the same information as the default \fBnumastat\fP behavior but the units will be megabytes of memory, and there will be other formatting and layout changes versus the original \fBnumastat\fP behavior. .TP \fB\-p\fR <\fBPID\fP> or <\fBpattern\fP> Show per-node memory allocation information for the specified PID or pattern. If the \-p argument is only digits, it is assumed to be a numerical PID. 
If the argument characters are not only digits, it is assumed to be a text fragment pattern to search for in process command lines. For example, \fBnumastat -p qemu\fP will attempt to find and show information for processes with "qemu" in the command line. Any command line arguments remaining after \fBnumastat\fP option flag processing is completed, are assumed to be additional <\fBPID\fP> or <\fBpattern\fP> process specifiers. In this sense, the \fB\-p\fP option flag is optional: \fBnumastat qemu\fP is equivalent to \fBnumastat -p qemu\fP .TP \fB\-s[<node>]\fR Sort the table data in descending order before displaying it, so the biggest memory consumers are listed first. With no specified <node>, the table will be sorted by the total column. If the optional <node> argument is supplied, the data will be sorted by the <node> column. Note that <node> must follow the \fB\-s\fP immediately with no intermediate white space (e.g., \fBnumastat \-s2\fP). Because \fB\-s\fP can allow an optional argument, it must always be the last option character in a compound option character string. For example, instead of \fBnumastat \-msc\fP (which probably will not work as you expect), use \fBnumastat \-mcs\fP .TP \fB\-v\fR Make some reports more verbose. In particular, process information for multiple processes will display detailed information for each process. Normally when per-node information for multiple processes is displayed, only the total lines are shown. .TP \fB\-V\fR Display \fBnumastat\fP version information and exit. .TP \fB\-z\fR Skip display of table rows and columns of only zero values. This can be used to greatly reduce the amount of uninteresting zero data on systems with many NUMA nodes. Note that when rows or columns of zeros are still displayed with this option, that probably means there is at least one value in the row or column that is actually non-zero, but rounded to zero for display.
.SH NOTES \fBnumastat\fP attempts to fold each table display so it will be conveniently readable on the output terminal. Normally a terminal width of 80 characters is assumed. When the \fBresize\fP command is available, \fBnumastat\fP attempts to dynamically determine and fine tune the output tty width from \fBresize\fP output. If \fBnumastat\fP output is not to a tty, very long output lines can be produced, depending on how many NUMA nodes are present. In all cases, output width can be explicitly specified via the \fBNUMASTAT_WIDTH\fP environment variable. For example, \fBNUMASTAT_WIDTH=100 numastat\fP. On systems with many NUMA nodes, \fBnumastat \-c \-z ....\fP can be very helpful to selectively reduce the amount of displayed information. .SH "ENVIRONMENT VARIABLES" .LP .TP NUMASTAT_WIDTH .SH "FILES" .LP \fI/proc/*/numa_maps\fP .br \fI/sys/devices/system/node/node*/meminfo\fP .br \fI/sys/devices/system/node/node*/numastat\fP .SH "EXAMPLES" .I numastat \-c \-z \-m \-n .br .I numastat \-czs libvirt kvm qemu .br .I watch \-n1 numastat .br .I watch \-n1 \-\-differences=cumulative numastat .SH "AUTHORS" .LP The original \fBnumastat\fP perl script was written circa 2003 by Andi Kleen <>. The current \fBnumastat\fP program was written in 2012 by Bill Gray <> to be compatible by default with the original, and to add options to display per-node system memory usage and per-node process memory allocation. .SH "SEE ALSO" .LP .BR numactl (8), .BR set_mempolicy( 2), .BR numa (3) 0707010000002F000081A40000000000000000000000016319106A0000F2DC000000000000000000000000000000000000002500000000numactl-* numastat - NUMA monitoring tool to show per-node usage of memory Copyright (C) 2012 Bill Gray (, Red Hat Inc numastat is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; version 2.1. 
numastat is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should find a copy of v2.1 of the GNU Lesser General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* Historical note: From approximately 2003 to 2012, numastat was a perl script written by Andi Kleen to display the /sys/devices/system/node/node<N>/numastat statistics. In 2012, numastat was rewritten as a C program by Red Hat to display per-node memory data for applications and the system in general, while also remaining strictly compatible by default with the original numastat. A copy of the original numastat perl script is included for reference at the end of this file. */ // Compile with: gcc -O -std=gnu99 -Wall -o numastat numastat.c #define __USE_MISC #include <ctype.h> #include <dirent.h> #include <getopt.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <unistd.h> #define STRINGIZE(s) #s #define STRINGIFY(s) STRINGIZE(s) #define KILOBYTE (1024) #define MEGABYTE (1024 * 1024) #define BUF_SIZE 2048 #define SMALL_BUF_SIZE 128 // Don't assume nodes are sequential or contiguous. // Need to discover and map node numbers. int *node_ix_map = NULL; char **node_header; // Structure to organize memory info from /proc/<PID>/numa_maps for a specific // process, or from /sys/devices/system/node/node?/meminfo for system-wide // data. Tables are defined below for each process and for system-wide data. 
// One row of a memory-category table: the row's position in the displayed
// table, the token used to recognise it in the kernel file, and the label
// printed in the left header column.
typedef struct meminfo {
    int index;
    char *token;
    char *label;
} meminfo_t, *meminfo_p;

#define PROCESS_HUGE_INDEX 0
#define PROCESS_PRIVATE_INDEX 3

// Per-process categories.  The last row ("N" / "Private") is the catch-all.
static meminfo_t process_meminfo[] = {
    { PROCESS_HUGE_INDEX,    "huge",  "Huge"    },
    { 1,                     "heap",  "Heap"    },
    { 2,                     "stack", "Stack"   },
    { PROCESS_PRIVATE_INDEX, "N",     "Private" }
};

#define PROCESS_MEMINFO_ROWS (sizeof(process_meminfo) / sizeof(process_meminfo[0]))

// Rows for the per-node numastat file.
static meminfo_t numastat_meminfo[] = {
    { 0, "numa_hit",       "Numa_Hit"       },
    { 1, "numa_miss",      "Numa_Miss"      },
    { 2, "numa_foreign",   "Numa_Foreign"   },
    { 3, "interleave_hit", "Interleave_Hit" },
    { 4, "local_node",     "Local_Node"     },
    { 5, "other_node",     "Other_Node"     },
};

#define NUMASTAT_MEMINFO_ROWS (sizeof(numastat_meminfo) / sizeof(numastat_meminfo[0]))

// Rows for the per-node meminfo file.
static meminfo_t system_meminfo[] = {
    {  0, "MemTotal",        "MemTotal"        },
    {  1, "MemFree",         "MemFree"         },
    {  2, "MemUsed",         "MemUsed"         },
    {  3, "HighTotal",       "HighTotal"       },
    {  4, "HighFree",        "HighFree"        },
    {  5, "LowTotal",        "LowTotal"        },
    {  6, "LowFree",         "LowFree"         },
    {  7, "Active",          "Active"          },
    {  8, "Inactive",        "Inactive"        },
    {  9, "Active(anon)",    "Active(anon)"    },
    { 10, "Inactive(anon)",  "Inactive(anon)"  },
    { 11, "Active(file)",    "Active(file)"    },
    { 12, "Inactive(file)",  "Inactive(file)"  },
    { 13, "Unevictable",     "Unevictable"     },
    { 14, "Mlocked",         "Mlocked"         },
    { 15, "Dirty",           "Dirty"           },
    { 16, "Writeback",       "Writeback"       },
    { 17, "FilePages",       "FilePages"       },
    { 18, "Mapped",          "Mapped"          },
    { 19, "AnonPages",       "AnonPages"       },
    { 20, "Shmem",           "Shmem"           },
    { 21, "KernelStack",     "KernelStack"     },
    { 22, "PageTables",      "PageTables"      },
    { 23, "NFS_Unstable",    "NFS_Unstable"    },
    { 24, "Bounce",          "Bounce"          },
    { 25, "WritebackTmp",    "WritebackTmp"    },
    { 26, "Slab",            "Slab"            },
    { 27, "SReclaimable",    "SReclaimable"    },
    { 28, "SUnreclaim",      "SUnreclaim"      },
    { 29, "AnonHugePages",   "AnonHugePages"   },
    { 30, "ShmemHugePages",  "ShmemHugePages"  },
    { 31, "ShmemPmdMapped",  "ShmemPmdMapped"  },
    { 32, "HugePages_Total", "HugePages_Total" },
    { 33, "HugePages_Free",  "HugePages_Free"  },
    { 34, "HugePages_Surp",  "HugePages_Surp"  },
    { 35, "KReclaimable",    "KReclaimable"    }
};

#define SYSTEM_MEMINFO_ROWS (sizeof(system_meminfo) / sizeof(system_meminfo[0]))

// To allow re-ordering the meminfo memory categories in system_meminfo and
// numastat_meminfo relative to order in /proc, etc., a simple hash index is
// used to look up the meminfo categories.  The allocated hash table size must
// be bigger than necessary to reduce collisions (and because these specific
// hash algorithms depend on having some unused buckets).

#define HASH_TABLE_SIZE 151
static int hash_collisions = 0;

struct hash_entry {
    char *name;
    int index;
} hash_table[HASH_TABLE_SIZE];

// Empty every bucket (a NULL name marks an unused bucket).
static void init_hash_table(void)
{
    memset(hash_table, 0, sizeof hash_table);
}

// Map a string to its home bucket: seed 17, multiply by 33 and add each
// character, then reduce modulo the table size.
static int hash_ix(char *s)
{
    unsigned int hash = 17;
    for (; *s != '\0'; s++) {
        hash = (hash * 33) + *s;
    }
    return (hash % HASH_TABLE_SIZE);
}

// Return the index stored for s, or -1 when s is not present.  Probes
// linearly from the home bucket and stops at the first unused bucket, so the
// table must always keep at least one bucket free.
static int hash_lookup(char *s)
{
    for (int slot = hash_ix(s);
         hash_table[slot].name != NULL;
         slot = (slot + 1) % HASH_TABLE_SIZE) {
        if (strcmp(s, hash_table[slot].name) == 0) {
            return hash_table[slot].index;
        }
    }
    return -1;
}

// Store (s, i) in the first unused bucket at or after the home bucket of s
// and return that bucket number.  Every occupied bucket stepped over is
// counted in hash_collisions.  Duplicate names are not detected.
static int hash_insert(char *s, int i)
{
    int slot = hash_ix(s);
    while (hash_table[slot].name != NULL) {
        hash_collisions++;
        slot = (slot + 1 < HASH_TABLE_SIZE) ? (slot + 1) : 0;
    }
    hash_table[slot].name = s;
    hash_table[slot].index = i;
    return slot;
}

// To decouple details of table display (e.g. column width, line folding for
// display screen width, et cetera) from acquiring the data and populating the
// tables, this semi-general table handling code is used.  There are various
// routines to set table attributes, assign and test some cell contents,
// initialize and actually display the table.
#define CELL_TYPE_NULL 0 #define CELL_TYPE_LONG 1 #define CELL_TYPE_DOUBLE 2 #define CELL_TYPE_STRING 3 #define CELL_TYPE_CHAR8 4 #define CELL_TYPE_REPCHAR 5 #define CELL_FLAG_FREEABLE (1 << 0) #define CELL_FLAG_ROWSPAN (1 << 1) #define CELL_FLAG_COLSPAN (1 << 2) #define COL_JUSTIFY_LEFT (1 << 0) #define COL_JUSTIFY_RIGHT (1 << 1) #define COL_JUSTIFY_CENTER 3 #define COL_JUSTIFY_MASK 0x3 #define COL_FLAG_SEEN_DATA (1 << 2) #define COL_FLAG_NON_ZERO_DATA (1 << 3) #define COL_FLAG_ALWAYS_SHOW (1 << 4) #define ROW_FLAG_SEEN_DATA COL_FLAG_SEEN_DATA #define ROW_FLAG_NON_ZERO_DATA COL_FLAG_NON_ZERO_DATA #define ROW_FLAG_ALWAYS_SHOW COL_FLAG_ALWAYS_SHOW typedef struct cell { uint32_t type; uint32_t flags; union { char *s; double d; int64_t l; char c[8]; }; } cell_t, *cell_p; typedef struct vtab { int header_rows; int header_cols; int data_rows; int data_cols; cell_p cell; int *row_ix_map; uint8_t *row_flags; uint8_t *col_flags; uint8_t *col_width; uint8_t *col_decimal_places; } vtab_t, *vtab_p; #define ALL_TABLE_ROWS (table->header_rows + table->data_rows) #define ALL_TABLE_COLS (table->header_cols + table->data_cols) #define GET_CELL_PTR(row, col) (&table->cell[(row * ALL_TABLE_COLS) + col]) #define USUAL_GUTTER_WIDTH 1 static inline void set_row_flag(vtab_p table, int row, int flag) { table->row_flags[row] |= (uint8_t)flag; } static inline void set_col_flag(vtab_p table, int col, int flag) { table->col_flags[col] |= (uint8_t)flag; } static inline void clear_row_flag(vtab_p table, int row, int flag) { table->row_flags[row] &= (uint8_t)~flag; } static inline void clear_col_flag(vtab_p table, int col, int flag) { table->col_flags[col] &= (uint8_t)~flag; } static inline int test_row_flag(vtab_p table, int row, int flag) { return ((table->row_flags[row] & (uint8_t)flag) != 0); } static inline int test_col_flag(vtab_p table, int col, int flag) { return ((table->col_flags[col] & (uint8_t)flag) != 0); } static inline void set_col_justification(vtab_p table, int col, int 
justify) { table->col_flags[col] &= (uint8_t)~COL_JUSTIFY_MASK; table->col_flags[col] |= (uint8_t)(justify & COL_JUSTIFY_MASK); } static inline void set_col_width(vtab_p table, int col, uint8_t width) { if (width >= SMALL_BUF_SIZE) { width = SMALL_BUF_SIZE - 1; } table->col_width[col] = width; } static inline void set_col_decimal_places(vtab_p table, int col, uint8_t places) { table->col_decimal_places[col] = places; } static inline void set_cell_flag(vtab_p table, int row, int col, int flag) { cell_p c_ptr = GET_CELL_PTR(row, col); c_ptr->flags |= (uint32_t)flag; } static inline void clear_cell_flag(vtab_p table, int row, int col, int flag) { cell_p c_ptr = GET_CELL_PTR(row, col); c_ptr->flags &= (uint32_t)~flag; } static inline int test_cell_flag(vtab_p table, int row, int col, int flag) { cell_p c_ptr = GET_CELL_PTR(row, col); return ((c_ptr->flags & (uint32_t)flag) != 0); } static inline void string_assign(vtab_p table, int row, int col, char *s) { cell_p c_ptr = GET_CELL_PTR(row, col); c_ptr->type = CELL_TYPE_STRING; c_ptr->s = s; } static inline void repchar_assign(vtab_p table, int row, int col, char c) { cell_p c_ptr = GET_CELL_PTR(row, col); c_ptr->type = CELL_TYPE_REPCHAR; c_ptr->c[0] = c; } static inline void double_assign(vtab_p table, int row, int col, double d) { cell_p c_ptr = GET_CELL_PTR(row, col); c_ptr->type = CELL_TYPE_DOUBLE; c_ptr->d = d; } static inline void long_assign(vtab_p table, int row, int col, int64_t l) { cell_p c_ptr = GET_CELL_PTR(row, col); c_ptr->type = CELL_TYPE_LONG; c_ptr->l = l; } static inline void double_addto(vtab_p table, int row, int col, double d) { cell_p c_ptr = GET_CELL_PTR(row, col); c_ptr->type = CELL_TYPE_DOUBLE; c_ptr->d += d; } static inline void long_addto(vtab_p table, int row, int col, int64_t l) { cell_p c_ptr = GET_CELL_PTR(row, col); c_ptr->type = CELL_TYPE_LONG; c_ptr->l += l; } static inline void clear_assign(vtab_p table, int row, int col) { cell_p c_ptr = GET_CELL_PTR(row, col); memset(c_ptr, 0, 
sizeof(cell_t)); } static void zero_table_data(vtab_p table, int type) { // Sets data area of table to zeros of specified type for (int row = table->header_rows; (row < ALL_TABLE_ROWS); row++) { for (int col = table->header_cols; (col < ALL_TABLE_COLS); col++) { cell_p c_ptr = GET_CELL_PTR(row, col); memset(c_ptr, 0, sizeof(cell_t)); c_ptr->type = type; } } } static void sort_rows_descending_by_col(vtab_p table, int start_row, int stop_row, int col) { // Rearrange row_ix_map[] indices so the rows will be in // descending order by the value in the specified column for (int ix = start_row; (ix <= stop_row); ix++) { int biggest_ix = ix; cell_p biggest_ix_c_ptr = GET_CELL_PTR(table->row_ix_map[ix], col); for (int iy = ix + 1; (iy <= stop_row); iy++) { cell_p iy_c_ptr = GET_CELL_PTR(table->row_ix_map[iy], col); if (biggest_ix_c_ptr->d < iy_c_ptr->d) { biggest_ix_c_ptr = iy_c_ptr; biggest_ix = iy; } } if (biggest_ix != ix) { int tmp = table->row_ix_map[ix]; table->row_ix_map[ix] = table->row_ix_map[biggest_ix]; table->row_ix_map[biggest_ix] = tmp; } } } static void init_table(vtab_p table, int header_rows, int header_cols, int data_rows, int data_cols) { // init table sizes table->header_rows = header_rows; table->header_cols = header_cols; table->data_rows = data_rows; table->data_cols = data_cols; // allocate memory for all the cells int alloc_size = ALL_TABLE_ROWS * ALL_TABLE_COLS * sizeof(cell_t); table->cell = malloc(alloc_size); if (table->cell == NULL) { perror("malloc failed line: " STRINGIFY(__LINE__)); exit(EXIT_FAILURE); } memset(table->cell, 0, alloc_size); // allocate memory for the row map vector alloc_size = ALL_TABLE_ROWS * sizeof(int); table->row_ix_map = malloc(alloc_size); if (table->row_ix_map == NULL) { perror("malloc failed line: " STRINGIFY(__LINE__)); exit(EXIT_FAILURE); } for (int row = 0; (row < ALL_TABLE_ROWS); row++) { table->row_ix_map[row] = row; } // allocate memory for the row flags vector alloc_size = ALL_TABLE_ROWS * sizeof(uint8_t); 
table->row_flags = malloc(alloc_size); if (table->row_flags == NULL) { perror("malloc failed line: " STRINGIFY(__LINE__)); exit(EXIT_FAILURE); } memset(table->row_flags, 0, alloc_size); // allocate memory for the column flags vector alloc_size = ALL_TABLE_COLS * sizeof(uint8_t); table->col_flags = malloc(alloc_size); if (table->col_flags == NULL) { perror("malloc failed line: " STRINGIFY(__LINE__)); exit(EXIT_FAILURE); } memset(table->col_flags, 0, alloc_size); // allocate memory for the column width vector alloc_size = ALL_TABLE_COLS * sizeof(uint8_t); table->col_width = malloc(alloc_size); if (table->col_width == NULL) { perror("malloc failed line: " STRINGIFY(__LINE__)); exit(EXIT_FAILURE); } memset(table->col_width, 0, alloc_size); // allocate memory for the column precision vector alloc_size = ALL_TABLE_COLS * sizeof(uint8_t); table->col_decimal_places = malloc(alloc_size); if (table->col_decimal_places == NULL) { perror("malloc failed line: " STRINGIFY(__LINE__)); exit(EXIT_FAILURE); } memset(table->col_decimal_places, 0, alloc_size); } static void free_cell(vtab_p table, int row, int col) { cell_p c_ptr = GET_CELL_PTR(row, col); if ((c_ptr->type == CELL_TYPE_STRING) && (c_ptr->flags & CELL_FLAG_FREEABLE) && (c_ptr->s != NULL)) { free(c_ptr->s); } memset(c_ptr, 0, sizeof(cell_t)); } static void free_table(vtab_p table) { if (table->cell != NULL) { for (int row = 0; (row < ALL_TABLE_ROWS); row++) { for (int col = 0; (col < ALL_TABLE_COLS); col++) { free_cell(table, row, col); } } free(table->cell); } if (table->row_ix_map != NULL) { free(table->row_ix_map); } if (table->row_flags != NULL) { free(table->row_flags); } if (table->col_flags != NULL) { free(table->col_flags); } if (table->col_width != NULL) { free(table->col_width); } if (table->col_decimal_places != NULL) { free(table->col_decimal_places); } } static char *fmt_cell_data(cell_p c_ptr, int max_width, int decimal_places) { // Returns pointer to a static buffer, expecting caller to // immediately use 
or copy the contents before calling again. int rep_width = max_width - USUAL_GUTTER_WIDTH; static char buf[SMALL_BUF_SIZE]; switch (c_ptr->type) { case CELL_TYPE_NULL: buf[0] = '\0'; break; case CELL_TYPE_LONG: snprintf(buf, SMALL_BUF_SIZE, "%ld", c_ptr->l); break; case CELL_TYPE_DOUBLE: snprintf(buf, SMALL_BUF_SIZE, "%.*f", decimal_places, c_ptr->d); break; case CELL_TYPE_STRING: snprintf(buf, SMALL_BUF_SIZE, "%s", c_ptr->s); break; case CELL_TYPE_CHAR8: strncpy(buf, c_ptr->c, 8); buf[8] = '\0'; break; case CELL_TYPE_REPCHAR: memset(buf, c_ptr->c[0], rep_width); buf[rep_width] = '\0'; break; default: strcpy(buf, "Unknown"); break; } buf[max_width] = '\0'; return buf; } static void auto_set_col_width(vtab_p table, int col, int min_width, int max_width) { int width = min_width; for (int row = 0; (row < ALL_TABLE_ROWS); row++) { cell_p c_ptr = GET_CELL_PTR(row, col); if (c_ptr->type == CELL_TYPE_REPCHAR) { continue; } char *p = fmt_cell_data(c_ptr, max_width, (int)(table->col_decimal_places[col])); int l = strlen(p); if (width < l) { width = l; } } width += USUAL_GUTTER_WIDTH; if (width > max_width) { width = max_width; } table->col_width[col] = (uint8_t)width; } static void display_justified_cell(cell_p c_ptr, int row_flags, int col_flags, int width, int decimal_places) { char *p = fmt_cell_data(c_ptr, width, decimal_places); int l = strlen(p); char buf[SMALL_BUF_SIZE]; switch (col_flags & COL_JUSTIFY_MASK) { case COL_JUSTIFY_LEFT: memcpy(buf, p, l); if (l < width) { memset(&buf[l], ' ', width - l); } break; case COL_JUSTIFY_RIGHT: if (l < width) { memset(buf, ' ', width - l); } memcpy(&buf[width - l], p, l); break; case COL_JUSTIFY_CENTER: default: memset(buf, ' ', width); memcpy(&buf[(width - l + 1) / 2], p, l); break; } buf[width] = '\0'; printf("%s", buf); } static void display_table(vtab_p table, int screen_width, int show_unseen_rows, int show_unseen_cols, int show_zero_rows, int show_zero_cols) { // Set row and column flags according to whether data in rows 
and cols // has been assigned, and is currently non-zero. int some_seen_data = 0; int some_non_zero_data = 0; for (int row = table->header_rows; (row < ALL_TABLE_ROWS); row++) { for (int col = table->header_cols; (col < ALL_TABLE_COLS); col++) { cell_p c_ptr = GET_CELL_PTR(row, col); // Currently, "seen data" includes not only numeric data, but also // any strings, etc -- anything non-NULL (other than rephcars). if ((c_ptr->type != CELL_TYPE_NULL) && (c_ptr->type != CELL_TYPE_REPCHAR)) { some_seen_data = 1; set_row_flag(table, row, ROW_FLAG_SEEN_DATA); set_col_flag(table, col, COL_FLAG_SEEN_DATA); // Currently, "non-zero data" includes not only numeric data, // but also any strings, etc -- anything non-zero (other than // repchars, which are already excluded above). So, note a // valid non-NULL pointer to an empty string would still be // counted as non-zero data. if (c_ptr->l != (int64_t)0) { some_non_zero_data = 1; set_row_flag(table, row, ROW_FLAG_NON_ZERO_DATA); set_col_flag(table, col, COL_FLAG_NON_ZERO_DATA); } } } } if (!some_seen_data) { printf("Table has no data.\n"); return; } if (!some_non_zero_data && !show_zero_rows && !show_zero_cols) { printf("Table has no non-zero data.\n"); return; } // Start with first data column and try to display table, // folding lines as necessary per screen_width int col = -1; int data_col = table->header_cols; while (data_col < ALL_TABLE_COLS) { // Skip data columns until we have one to display if ((!test_col_flag(table, data_col, COL_FLAG_ALWAYS_SHOW)) && (((!show_unseen_cols) && (!test_col_flag(table, data_col, COL_FLAG_SEEN_DATA))) || ((!show_zero_cols) && (!test_col_flag(table, data_col, COL_FLAG_NON_ZERO_DATA))))) { data_col += 1; continue; } // Display blank line between table sections if (col > 0) { printf("\n"); } // For each row, display as many columns as possible for (int row_ix = 0; (row_ix < ALL_TABLE_ROWS); row_ix++) { int row = table->row_ix_map[row_ix]; // If past the header rows, conditionally skip rows if 
((row >= table->header_rows) && (!test_row_flag(table, row, ROW_FLAG_ALWAYS_SHOW))) { // Optionally skip row if no data seen or if all zeros if (((!show_unseen_rows) && (!test_row_flag(table, row, ROW_FLAG_SEEN_DATA))) || ((!show_zero_rows) && (!test_row_flag(table, row, ROW_FLAG_NON_ZERO_DATA)))) { continue; } } // Begin a new row... int cur_line_width = 0; // All lines start with the left header columns for (col = 0; (col < table->header_cols); col++) { display_justified_cell(GET_CELL_PTR(row, col), (int)(table->row_flags[row]), (int)(table->col_flags[col]), (int)(table->col_width[col]), (int)(table->col_decimal_places[col])); cur_line_width += (int)(table->col_width[col]); } // Reset column index to starting data column for each new row col = data_col; // Try to display as many data columns as possible in every section for (;;) { // See if we should print this column if (test_col_flag(table, col, COL_FLAG_ALWAYS_SHOW) || (((show_unseen_cols) || (test_col_flag(table, col, COL_FLAG_SEEN_DATA))) && ((show_zero_cols) || (test_col_flag(table, col, COL_FLAG_NON_ZERO_DATA))))) { display_justified_cell(GET_CELL_PTR(row, col), (int)(table->row_flags[row]), (int)(table->col_flags[col]), (int)(table->col_width[col]), (int)(table->col_decimal_places[col])); cur_line_width += (int)(table->col_width[col]); } col += 1; // End the line if no more columns or next column would exceed screen width if ((col >= ALL_TABLE_COLS) || ((cur_line_width + (int)(table->col_width[col])) > screen_width)) { break; } } printf("\n"); } // Remember next starting data column for next section data_col = col; } } static int verbose = 0; static int num_pids = 0; static int num_nodes = 0; static int screen_width = 0; static int show_zero_data = 1; static int compress_display = 0; static int sort_table = 0; static int sort_table_node = -1; static int compatibility_mode = 0; static int pid_array_max_pids = 0; static int *pid_array = NULL; static char *prog_name = NULL; static double page_size_in_bytes = 
0; static double huge_page_size_in_bytes = 0; static void display_version_and_exit(void) { char *version_string = "20130723"; printf("%s version: %s: %s\n", prog_name, version_string, __DATE__); exit(EXIT_SUCCESS); } static void display_usage_and_exit(void) { fprintf(stderr, "Usage: %s [-c] [-m] [-n] [-p <PID>|<pattern>] [-s[<node>]] [-v] [-V] [-z] [ <PID>|<pattern>... ]\n", prog_name); fprintf(stderr, "-c to minimize column widths\n"); fprintf(stderr, "-m to show meminfo-like system-wide memory usage\n"); fprintf(stderr, "-n to show the numastat statistics info\n"); fprintf(stderr, "-p <PID>|<pattern> to show process info\n"); fprintf(stderr, "-s[<node>] to sort data by total column or <node>\n"); fprintf(stderr, "-v to make some reports more verbose\n"); fprintf(stderr, "-V to show the %s code version\n", prog_name); fprintf(stderr, "-z to skip rows and columns of zeros\n"); exit(EXIT_FAILURE); } static int get_screen_width(void) { int width = 80; char *p = getenv("NUMASTAT_WIDTH"); if (p != NULL) { width = atoi(p); if ((width < 1) || (width > 10000000)) { width = 80; } } else if (isatty(fileno(stdout))) { FILE *fs = popen("resize 2>/dev/null", "r"); if (fs != NULL) { char buf[72]; char *columns; columns = fgets(buf, sizeof(columns), fs); pclose(fs); if (columns && strncmp(columns, "COLUMNS=", 8) == 0) { width = atoi(&columns[8]); if ((width < 1) || (width > 10000000)) { width = 80; } } } } else { // Not a tty, so allow a really long line width = 10000000; } if (width < 32) { width = 32; } return width; } static char *command_name_for_pid(int pid) { // Get the PID command name field from /proc/PID/status file. Return // pointer to a static buffer, expecting caller to immediately copy result. 
static char buf[SMALL_BUF_SIZE]; char fname[64]; snprintf(fname, sizeof(fname), "/proc/%d/status", pid); FILE *fs = fopen(fname, "r"); if (!fs) { return NULL; } else { while (fgets(buf, SMALL_BUF_SIZE, fs)) { if (strstr(buf, "Name:") == buf) { char *p = &buf[5]; while (isspace(*p)) { p++; } if (p[strlen(p) - 1] == '\n') { p[strlen(p) - 1] = '\0'; } fclose(fs); return p; } } fclose(fs); } return NULL; } static void show_info_from_system_file(char *file, meminfo_p meminfo, int meminfo_rows, int tok_offset) { // Setup and init table vtab_t table; int header_rows = 2 - compatibility_mode; int header_cols = 1; // Add an extra data column for a total column init_table(&table, header_rows, header_cols, meminfo_rows, num_nodes + 1); int total_col_ix = header_cols + num_nodes; // Insert token mapping in hash table and assign left header column label for each row in table init_hash_table(); for (int row = 0; (row < meminfo_rows); row++) { hash_insert(meminfo[row].token, meminfo[row].index); if (compatibility_mode) { string_assign(&table, (header_rows + row), 0, meminfo[row].token); } else { string_assign(&table, (header_rows + row), 0, meminfo[row].label); } } // printf("There are %d table hash collisions.\n", hash_collisions); // Set left header column width and left justify it set_col_width(&table, 0, 16); set_col_justification(&table, 0, COL_JUSTIFY_LEFT); // Open /sys/devices/system/node/node?/<file> for each node and store data // in table. If not compatibility_mode, do approximately first third of // this loop also for (node_ix == num_nodes) to get "Total" column header. for (int node_ix = 0; (node_ix < (num_nodes + (1 - compatibility_mode))); node_ix++) { int col = header_cols + node_ix; // Assign header row label and horizontal line for this column... 
string_assign(&table, 0, col, node_header[node_ix]); if (!compatibility_mode) { repchar_assign(&table, 1, col, '-'); int decimal_places = 2; if (compress_display) { decimal_places = 0; } set_col_decimal_places(&table, col, decimal_places); } // Set column width and right justify data set_col_width(&table, col, 16); set_col_justification(&table, col, COL_JUSTIFY_RIGHT); if (node_ix == num_nodes) { break; } // Open /sys/.../node<N>/numstast file for this node... char buf[SMALL_BUF_SIZE]; char fname[64]; snprintf(fname, sizeof(fname), "/sys/devices/system/node/node%d/%s", node_ix_map[node_ix], file); FILE *fs = fopen(fname, "r"); if (!fs) { sprintf(buf, "cannot open %s", fname); perror(buf); exit(EXIT_FAILURE); } // Get table values for this node... while (fgets(buf, SMALL_BUF_SIZE, fs)) { char *tok[64]; int tokens = 0; const char *delimiters = " \t\r\n:"; char *p = strtok(buf, delimiters); if (p == NULL) { continue; // Skip blank lines; } while (p) { tok[tokens++] = p; p = strtok(NULL, delimiters); } // example line from numastat file: "numa_miss 16463" // example line from meminfo file: "Node 3 Inactive: 210680 kB" int index = hash_lookup(tok[0 + tok_offset]); if (index < 0) { printf("Token %s not in hash table.\n", tok[0 + tok_offset]); } else { double value = (double)atol(tok[1 + tok_offset]); if (!compatibility_mode) { double multiplier = 1.0; if (tokens < 4) { multiplier = page_size_in_bytes; } else if (!strncmp("HugePages", tok[2], 9)) { multiplier = huge_page_size_in_bytes; } else if (!strncmp("kB", tok[4], 2)) { multiplier = KILOBYTE; } value *= multiplier; value /= (double)MEGABYTE; } double_assign(&table, header_rows + index, col, value); double_addto(&table, header_rows + index, total_col_ix, value); } } fclose(fs); } // Crompress display column widths, if requested if (compress_display) { for (int col = 0; (col < header_cols + num_nodes + 1); col++) { auto_set_col_width(&table, col, 4, 16); } } // Optionally sort the table data if (sort_table) { int 
sort_col; if ((sort_table_node < 0) || (sort_table_node >= num_nodes)) { sort_col = total_col_ix; } else { sort_col = header_cols + node_ix_map[sort_table_node]; } sort_rows_descending_by_col(&table, header_rows, header_rows + meminfo_rows - 1, sort_col); } // Actually display the table now, doing line-folding as necessary display_table(&table, screen_width, 0, 0, show_zero_data, show_zero_data); free_table(&table); } static void show_numastat_info(void) { if (!compatibility_mode) { printf("\nPer-node numastat info (in MBs):\n"); } show_info_from_system_file("numastat", numastat_meminfo, NUMASTAT_MEMINFO_ROWS, 0); } static void show_system_info(void) { printf("\nPer-node system memory usage (in MBs):\n"); show_info_from_system_file("meminfo", system_meminfo, SYSTEM_MEMINFO_ROWS, 2); } static void show_process_info(void) { vtab_t table; int header_rows = 2; int header_cols = 1; int data_rows; int show_sub_categories = (verbose || (num_pids == 1)); if (show_sub_categories) { data_rows = PROCESS_MEMINFO_ROWS; } else { data_rows = num_pids; } // Add two extra rows for a horizontal rule followed by a total row // Add one extra data column for a total column init_table(&table, header_rows, header_cols, data_rows + 2, num_nodes + 1); int total_col_ix = header_cols + num_nodes; int total_row_ix = header_rows + data_rows + 1; string_assign(&table, total_row_ix, 0, "Total"); if (show_sub_categories) { // Assign left header column label for each row in table for (int row = 0; (row < PROCESS_MEMINFO_ROWS); row++) { string_assign(&table, (header_rows + row), 0, process_meminfo[row].label); } } else { string_assign(&table, 0, 0, "PID"); repchar_assign(&table, 1, 0, '-'); printf("\nPer-node process memory usage (in MBs)\n"); } // Set left header column width and left justify it set_col_width(&table, 0, 16); set_col_justification(&table, 0, COL_JUSTIFY_LEFT); // Set up "Node <N>" column headers over data columns, plus "Total" column for (int node_ix = 0; (node_ix <= num_nodes); 
node_ix++) { int col = header_cols + node_ix; // Assign header row label and horizontal line for this column... string_assign(&table, 0, col, node_header[node_ix]); repchar_assign(&table, 1, col, '-'); // Set column width, decimal places, and right justify data set_col_width(&table, col, 16); int decimal_places = 2; if (compress_display) { decimal_places = 0; } set_col_decimal_places(&table, col, decimal_places); set_col_justification(&table, col, COL_JUSTIFY_RIGHT); } // Initialize data in table to all zeros zero_table_data(&table, CELL_TYPE_DOUBLE); // If (show_sub_categories), show individual process tables for each PID, // Otherwise show one big table of process total lines from all the PIDs. for (int pid_ix = 0; (pid_ix < num_pids); pid_ix++) { int pid = pid_array[pid_ix]; if (show_sub_categories) { printf("\nPer-node process memory usage (in MBs) for PID %d (%s)\n", pid, command_name_for_pid(pid)); if (pid_ix > 0) { // Re-initialize show_sub_categories table, because we re-use it for each PID. zero_table_data(&table, CELL_TYPE_DOUBLE); } } else { // Put this row's "PID (cmd)" label in left header column for this PID total row char tmp_buf[64]; snprintf(tmp_buf, sizeof(tmp_buf), "%d (%s)", pid, command_name_for_pid(pid)); char *p = strdup(tmp_buf); if (p == NULL) { perror("malloc failed line: " STRINGIFY(__LINE__)); exit(EXIT_FAILURE); } string_assign(&table, header_rows + pid_ix, 0, p); set_cell_flag(&table, header_rows + pid_ix, 0, CELL_FLAG_FREEABLE); } // Open numa_map for this PID to get per-node data char fname[64]; snprintf(fname, sizeof(fname), "/proc/%d/numa_maps", pid); char buf[BUF_SIZE]; FILE *fs = fopen(fname, "r"); if (!fs) { sprintf(buf, "Can't read /proc/%d/numa_maps", pid); perror(buf); continue; } // Add up sub-category memory used from each node. Must go line by line // through the numa_map figuring out which category memory, node, and the // amount. 
while (fgets(buf, BUF_SIZE, fs)) { int category = PROCESS_PRIVATE_INDEX; // init category to the catch-all... const char *delimiters = " \t\r\n"; char *p = strtok(buf, delimiters); while (p) { // If the memory category for this line is still the catch-all // (i.e. private), then see if the current token is a special // keyword for a specific memory sub-category. if (category == PROCESS_PRIVATE_INDEX) { for (int ix = 0; (ix < PROCESS_PRIVATE_INDEX); ix++) { if (!strncmp(p, process_meminfo[ix].token, strlen(process_meminfo[ix].token))) { category = ix; break; } } } // If the current token is a per-node pages quantity, parse the // node number and accumulate the number of pages in the specific // category (and also add to the total). if (p[0] == 'N') { int node_num = (int)strtol(&p[1], &p, 10); if (p[0] != '=') { perror("node value parse error"); exit(EXIT_FAILURE); } double value = (double)strtol(&p[1], &p, 10); double multiplier = page_size_in_bytes; if (category == PROCESS_HUGE_INDEX) { multiplier = huge_page_size_in_bytes; } value *= multiplier; value /= (double)MEGABYTE; // Add value to data cell, total_col, and total_row int tmp_row; if (show_sub_categories) { tmp_row = header_rows + category; } else { tmp_row = header_rows + pid_ix; } // Don't assume nodes are sequential or contiguous. // Need to find correct tmp_col from node_ix_map int i = 0; while(node_ix_map[i++] != node_num) ; int tmp_col = header_cols + i - 1; double_addto(&table, tmp_row, tmp_col, value); double_addto(&table, tmp_row, total_col_ix, value); double_addto(&table, total_row_ix, tmp_col, value); double_addto(&table, total_row_ix, total_col_ix, value); } // Get next token on the line p = strtok(NULL, delimiters); } } // Currently, a non-root user can open some numa_map files successfully // without error, but can't actually read the contents -- despite the // 444 file permissions. 
So, use ferror() to check here to see if we // actually got a read error, and if so, alert the user so they know // not to trust the zero in the table. if (ferror(fs)) { sprintf(buf, "Can't read /proc/%d/numa_maps", pid); perror(buf); exit(EXIT_FAILURE); } fclose(fs); // If showing individual tables, or we just added the last total line, // prepare the table for display and display it... if ((show_sub_categories) || (pid_ix + 1 == num_pids)) { // Crompress display column widths, if requested if (compress_display) { for (int col = 0; (col < header_cols + num_nodes + 1); col++) { auto_set_col_width(&table, col, 4, 16); } } else { // Since not compressing the display, allow the left header // column to be wider. Otherwise, sometimes process command // name instance numbers can be truncated in an annoying way. auto_set_col_width(&table, 0, 16, 24); } // Put dashes above Total line... set_row_flag(&table, total_row_ix - 1, COL_FLAG_ALWAYS_SHOW); for (int col = 0; (col < header_cols + num_nodes + 1); col++) { repchar_assign(&table, total_row_ix - 1, col, '-'); } // Optionally sort the table data if (sort_table) { int sort_col; if ((sort_table_node < 0) || (sort_table_node >= num_nodes)) { sort_col = total_col_ix; } else { sort_col = header_cols + node_ix_map[sort_table_node]; } sort_rows_descending_by_col(&table, header_rows, header_rows + data_rows - 1, sort_col); } // Actually show the table display_table(&table, screen_width, 0, 0, show_zero_data, show_zero_data); } } // END OF FOR_EACH-PID loop free_table(&table); } // show_process_info() int node_and_digits(const struct dirent *dptr) { char *p = (char *)(dptr->d_name); if (*p++ != 'n') return 0; if (*p++ != 'o') return 0; if (*p++ != 'd') return 0; if (*p++ != 'e') return 0; do { if (!isdigit(*p++)) return 0; } while (*p != '\0'); return 1; } static void init_node_ix_map_and_header(void) { // Count directory names of the form: /sys/devices/system/node/node<N> struct dirent **namelist; num_nodes = 
scandir("/sys/devices/system/node", &namelist, node_and_digits, NULL); if (num_nodes < 1) { if (compatibility_mode) { perror("sysfs not mounted or system not NUMA aware"); } else { perror("Couldn't open /sys/devices/system/node"); } exit(EXIT_FAILURE); } else { node_ix_map = malloc(num_nodes * sizeof(int)); if (node_ix_map == NULL) { perror("malloc failed line: " STRINGIFY(__LINE__)); exit(EXIT_FAILURE); } // For each "node<N>" filename present, save <N> in node_ix_map for (int ix = 0; (ix < num_nodes); ix++) { node_ix_map[ix] = atoi(&namelist[ix]->d_name[4]); free(namelist[ix]); } free(namelist); // Now, sort the node map in increasing order. Use a simplistic sort // since we expect a relatively short (and maybe pre-ordered) list. for (int ix = 0; (ix < num_nodes); ix++) { int smallest_ix = ix; for (int iy = ix + 1; (iy < num_nodes); iy++) { if (node_ix_map[smallest_ix] > node_ix_map[iy]) { smallest_ix = iy; } } if (smallest_ix != ix) { int tmp = node_ix_map[ix]; node_ix_map[ix] = node_ix_map[smallest_ix]; node_ix_map[smallest_ix] = tmp; } } // Construct vector of "Node <N>" and "Total" column headers. 
Allocate // one for each NUMA node, plus one on the end for the "Total" column node_header = malloc((num_nodes + 1) * sizeof(char *)); if (node_header == NULL) { perror("malloc failed line: " STRINGIFY(__LINE__)); exit(EXIT_FAILURE); } for (int node_ix = 0; (node_ix <= num_nodes); node_ix++) { char node_label[64]; if (node_ix == num_nodes) { strcpy(node_label, "Total"); } else if (compatibility_mode) { snprintf(node_label, sizeof(node_label), "node%d", node_ix_map[node_ix]); } else { snprintf(node_label, sizeof(node_label), "Node %d", node_ix_map[node_ix]); } char *s = strdup(node_label); if (s == NULL) { perror("malloc failed line: " STRINGIFY(__LINE__)); exit(EXIT_FAILURE); } node_header[node_ix] = s; } } } static void free_node_ix_map_and_header(void) { if (node_ix_map != NULL) { free(node_ix_map); node_ix_map = NULL; } if (node_header != NULL) { for (int ix = 0; (ix <= num_nodes); ix++) { free(node_header[ix]); } free(node_header); node_header = NULL; } } static double get_huge_page_size_in_bytes(void) { double huge_page_size = 0; FILE *fs = fopen("/proc/meminfo", "r"); if (!fs) { perror("Can't open /proc/meminfo"); exit(EXIT_FAILURE); } char buf[SMALL_BUF_SIZE]; while (fgets(buf, SMALL_BUF_SIZE, fs)) { if (!strncmp("Hugepagesize", buf, 12)) { char *p = &buf[12]; while ((!isdigit(*p)) && (p < buf + SMALL_BUF_SIZE)) { p++; } huge_page_size = strtod(p, NULL); break; } } fclose(fs); return huge_page_size * KILOBYTE; } static int all_digits(char *p) { if (p == NULL) { return 0; } while (*p != '\0') { if (!isdigit(*p++)) return 0; } return 1; } static int starts_with_digit(const struct dirent *dptr) { return (isdigit(dptr->d_name[0])); } static void add_pid_to_list(int pid) { if (num_pids < pid_array_max_pids) { pid_array[num_pids++] = pid; } else { if (pid_array_max_pids == 0) { pid_array_max_pids = 32; } int *tmp_int_ptr = realloc(pid_array, 2 * pid_array_max_pids * sizeof(int)); if (tmp_int_ptr == NULL) { char buf[SMALL_BUF_SIZE]; sprintf(buf, "Too many PIDs, 
skipping %d", pid); perror(buf); } else { pid_array = tmp_int_ptr; pid_array_max_pids *= 2; pid_array[num_pids++] = pid; } } } int ascending(const void *p1, const void *p2) { return *(int *)p1 - *(int *) p2; } static void sort_pids_and_remove_duplicates(void) { if (num_pids > 1) { qsort(pid_array, num_pids, sizeof(int), ascending); int ix1 = 0; for (int ix2 = 1; (ix2 < num_pids); ix2++) { if (pid_array[ix2] == pid_array[ix1]) { continue; } ix1 += 1; if (ix2 > ix1) { pid_array[ix1] = pid_array[ix2]; } } num_pids = ix1 + 1; } } static void add_pids_from_pattern_search(char *pattern) { // Search all /proc/<PID>/cmdline files and /proc/<PID>/status:Name fields // for matching patterns. Show the memory details for matching PIDs. int num_matches_found = 0; struct dirent **namelist; int files = scandir("/proc", &namelist, starts_with_digit, NULL); if (files < 0) { perror("Couldn't open /proc"); } for (int ix = 0; (ix < files); ix++) { char buf[BUF_SIZE]; // First get Name field from status file int pid = atoi(namelist[ix]->d_name); char *p = command_name_for_pid(pid); if (p) { strcpy(buf, p); } else { buf[0] = '\0'; } // Next copy cmdline file contents onto end of buffer. Do it a // character at a time to convert nulls to spaces. 
char fname[272]; snprintf(fname, sizeof(fname), "/proc/%s/cmdline", namelist[ix]->d_name); FILE *fs = fopen(fname, "r"); if (fs) { p = buf; while (*p != '\0') { p++; } *p++ = ' '; int c; while (((c = fgetc(fs)) != EOF) && (p < buf + BUF_SIZE - 1)) { if (c == '\0') { c = ' '; } *p++ = c; } *p++ = '\0'; fclose(fs); } if (strstr(buf, pattern)) { if (pid != getpid()) { add_pid_to_list(pid); num_matches_found += 1; } } free(namelist[ix]); } free(namelist); if (num_matches_found == 0) { printf("Found no processes containing pattern: \"%s\"\n", pattern); } } int main(int argc, char **argv) { prog_name = argv[0]; int show_the_system_info = 0; int show_the_numastat_info = 0; static struct option long_options[] = { {"help", 0, 0, '?'}, {0, 0, 0, 0} }; int long_option_index = 0; int opt; while ((opt = getopt_long(argc, argv, "cmnp:s::vVz?", long_options, &long_option_index)) != -1) { switch (opt) { case 0: printf("Unexpected long option %s", long_options[long_option_index].name); if (optarg) { printf(" with arg %s", optarg); } printf("\n"); display_usage_and_exit(); break; case 'c': compress_display = 1; break; case 'm': show_the_system_info = 1; break; case 'n': show_the_numastat_info = 1; break; case 'p': if ((optarg) && (all_digits(optarg))) { add_pid_to_list(atoi(optarg)); } else { add_pids_from_pattern_search(optarg); } break; case 's': sort_table = 1; if ((optarg) && (all_digits(optarg))) { sort_table_node = atoi(optarg); } break; case 'v': verbose = 1; break; case 'V': display_version_and_exit(); break; case 'z': show_zero_data = 0; break; default: case '?': display_usage_and_exit(); break; } } // Figure out the display width, which is used to format the tables // and limit the output columns per row screen_width = get_screen_width(); // Any remaining arguments are assumed to be additional process specifiers while (optind < argc) { if (all_digits(argv[optind])) { add_pid_to_list(atoi(argv[optind])); } else { add_pids_from_pattern_search(argv[optind]); } optind += 1; } 
// If there are no program options or arguments, be extremely compatible // with the old numastat perl script (which is included at the end of this // file for reference) compatibility_mode = (argc == 1); init_node_ix_map_and_header(); // enumarate the NUMA nodes if (compatibility_mode) { show_numastat_info(); free_node_ix_map_and_header(); exit(EXIT_SUCCESS); } // Figure out page sizes page_size_in_bytes = (double)sysconf(_SC_PAGESIZE); huge_page_size_in_bytes = get_huge_page_size_in_bytes(); // Display the info for the process specifiers if (num_pids > 0) { sort_pids_and_remove_duplicates(); show_process_info(); } if (pid_array != NULL) { free(pid_array); } // Display the system-wide memory usage info if (show_the_system_info) { show_system_info(); } // Display the numastat statistics info if ((show_the_numastat_info) || ((num_pids == 0) && (!show_the_system_info))) { show_numastat_info(); } free_node_ix_map_and_header(); exit(EXIT_SUCCESS); } #if 0 /* #!/usr/bin/perl # Print numa statistics for all nodes # Copyright (C) 2003,2004 Andi Kleen, SuSE Labs. # # numastat is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; version # 2. # # numastat is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. 
# You should find a copy of v2 of the GNU General Public License somewhere # on your Linux system; if not, write to the Free Software Foundation, # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Example: NUMASTAT_WIDTH=80 watch -n1 numastat # # output width $WIDTH=80; if (defined($ENV{'NUMASTAT_WIDTH'})) { $WIDTH=$ENV{'NUMASTAT_WIDTH'}; } else { use POSIX; if (POSIX::isatty(fileno(STDOUT))) { if (open(R, "resize |")) { while (<R>) { $WIDTH=$1 if /COLUMNS=(\d+)/; } close R; } } else { # don't split it up for easier parsing $WIDTH=10000000; } } $WIDTH = 32 if $WIDTH < 32; if (! -d "/sys/devices/system/node" ) { print STDERR "sysfs not mounted or system not NUMA aware\n"; exit 1; } %stat = (); $title = ""; $mode = 0; opendir(NODES, "/sys/devices/system/node") || exit 1; foreach $nd (readdir(NODES)) { next unless $nd =~ /node(\d+)/; # On newer kernels, readdir may enumerate the 'node(\d+) subdirs # in opposite order from older kernels--e.g., node{0,1,2,...} # as opposed to node{N,N-1,N-2,...}. Accommodate this by # switching to new mode so that the stats get emitted in # the same order. #print "readdir(NODES) returns $nd\n"; if (!$title && $nd =~ /node0/) { $mode = 1; } open(STAT, "/sys/devices/system/node/$nd/numastat") || die "cannot open $nd: $!\n"; if (! $mode) { $title = sprintf("%16s",$nd) . $title; } else { $title = $title . sprintf("%16s",$nd); } @fields = (); while (<STAT>) { ($name, $val) = split; if (! $mode) { $stat{$name} = sprintf("%16u", $val) . $stat{$name}; } else { $stat{$name} = $stat{$name} . 
sprintf("%16u", $val); } push(@fields, $name); } close STAT; } closedir NODES; $numfields = int(($WIDTH - 16) / 16); $l = 16 * $numfields; for ($i = 0; $i < length($title); $i += $l) { print "\n" if $i > 0; printf "%16s%s\n","",substr($title,$i,$l); foreach (@fields) { printf "%-16s%s\n",$_,substr($stat{$_},$i,$l); } } */ #endif 07070100000030000081A40000000000000000000000016319106A00000839000000000000000000000000000000000000002600000000numactl-* Simple LPGLed rtnetlink library */ #include <sys/socket.h> #include <linux/rtnetlink.h> #include <linux/netlink.h> #include <netinet/in.h> #include <errno.h> #include <unistd.h> #define hidden __attribute__((visibility("hidden"))) #include "rtnetlink.h" hidden void *rta_put(struct nlmsghdr *m, int type, int len) { struct rtattr *rta = (void *)m + NLMSG_ALIGN(m->nlmsg_len); int rtalen = RTA_LENGTH(len); rta->rta_type = type; rta->rta_len = rtalen; m->nlmsg_len = NLMSG_ALIGN(m->nlmsg_len) + RTA_ALIGN(rtalen); return RTA_DATA(rta); } hidden struct rtattr *rta_get(struct nlmsghdr *m, struct rtattr *p, int offset) { struct rtattr *rta; if (p) { rta = RTA_NEXT(p, m->nlmsg_len); if (!RTA_OK(rta, m->nlmsg_len)) return NULL; } else { rta = (void *)m + NLMSG_ALIGN(offset); } return rta; } hidden int rta_put_address(struct nlmsghdr *msg, int type, struct sockaddr *adr) { switch (adr->sa_family) { case AF_INET: { struct in_addr *i = rta_put(msg, type, 4); *i = ((struct sockaddr_in *)adr)->sin_addr; break; } case AF_INET6: { struct in6_addr *i6 = rta_put(msg, type, 16); *i6 = ((struct sockaddr_in6 *)adr)->sin6_addr; break; } default: return -1; } return 0; } /* Assumes no truncation. Make the buffer large enough. */ hidden int rtnetlink_request(struct nlmsghdr *msg, int buflen, struct sockaddr_nl *adr) { int rsk; int n; int e; /* Use a private socket to avoid having to keep state for a sequence number. 
*/ rsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (rsk < 0) return -1; n = sendto(rsk, msg, msg->nlmsg_len, 0, (struct sockaddr *)adr, sizeof(struct sockaddr_nl)); if (n >= 0) { socklen_t adrlen = sizeof(struct sockaddr_nl); n = recvfrom(rsk, msg, buflen, 0, (struct sockaddr *)adr, &adrlen); } e = errno; close(rsk); errno = e; if (n < 0) return -1; /* Assume we only get a single reply back. This is (hopefully?) safe because it's a single use socket. */ if (msg->nlmsg_type == NLMSG_ERROR) { struct nlmsgerr *err = NLMSG_DATA(msg); errno = -err->error; return -1; } return 0; } 07070100000031000081A40000000000000000000000016319106A00000139000000000000000000000000000000000000002600000000numactl- int rta_put_address(struct nlmsghdr *msg, int type, struct sockaddr *adr); hidden struct rtattr *rta_get(struct nlmsghdr *m, struct rtattr *p, int offset); hidden void *rta_put(struct nlmsghdr *m, int type, int len); hidden int rtnetlink_request(struct nlmsghdr *msg, int buflen, struct sockaddr_nl *adr); 07070100000032000081A40000000000000000000000016319106A000020A8000000000000000000000000000000000000002000000000numactl-* Copyright (C) 2003,2004 Andi Kleen, SuSE Labs. Manage shared memory policy for numactl. The actual policy is set in numactl itself, this just sets up and maps the shared memory segments and dumps them. numactl is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2. numactl is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should find a copy of v2 of the GNU General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define _GNU_SOURCE 1 #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/mman.h> #include <sys/ipc.h> #include <sys/shm.h> #include <sys/fcntl.h> #include <sys/stat.h> #include <stdarg.h> #include <errno.h> #include <unistd.h> #include "numa.h" #include "numaif.h" #include "numaint.h" #include "util.h" #include "shm.h" int shmfd = -1; long shmid = 0; char *shmptr; unsigned long long shmlen; mode_t shmmode = 0600; unsigned long long shmoffset; int shmflags; static int shm_pagesize; static long huge_page_size(void) { size_t len = 0; long huge_size = 0; char *line = NULL; FILE *f = fopen("/proc/meminfo", "r"); if (f != NULL) { while (getdelim(&line, &len, '\n', f) > 0) { int ps; if (sscanf(line, "Hugepagesize: %d kB", &ps) == 1) { huge_size = ps * 1024; break; } } free(line); fclose(f); } return huge_size ? huge_size : getpagesize(); } static void check_region(char *opt) { if (((unsigned long)shmptr % shm_pagesize) || (shmlen % shm_pagesize)) { fprintf(stderr, "numactl: policy region not page aligned\n"); exit(1); } if (!shmlen) { fprintf(stderr, "numactl: policy region length not specified before %s\n", opt); exit(1); } } static key_t sysvkey(char *name) { int fd; key_t key = ftok(name, shmid); if (key >= 0) return key; fprintf(stderr, "numactl: Creating shm key file %s mode %04o\n", name, shmmode); fd = creat(name, shmmode); if (fd < 0) nerror("cannot create key for shm %s\n", name); key = ftok(name, shmid); if (key < 0) nerror("cannot get key for newly created shm key file %s", name); return key; } /* Attach a sysv style shared memory segment. 
*/ void attach_sysvshm(char *name, char *opt) { struct shmid_ds s; key_t key = sysvkey(name); shmfd = shmget(key, shmlen, shmflags); if (shmfd < 0 && errno == ENOENT) { if (shmlen == 0) complain( "need a --length to create a sysv shared memory segment"); fprintf(stderr, "numactl: Creating shared memory segment %s id %ld mode %04o length %.fMB\n", name, shmid, shmmode, ((double)(shmlen + shmoffset)) / (1024*1024) ); shmfd = shmget(key, shmlen + shmoffset, IPC_CREAT|shmmode|shmflags); if (shmfd < 0) nerror("cannot create shared memory segment"); } if (shmlen == 0) { if (shmctl(shmfd, IPC_STAT, &s) < 0) err("shmctl IPC_STAT"); shmlen = s.shm_segsz; } shmptr = shmat(shmfd, NULL, 0); if (shmptr == (void*)-1) err("shmat"); shmptr += shmoffset; shm_pagesize = (shmflags & SHM_HUGETLB) ? huge_page_size() : getpagesize(); check_region(opt); } /* Attach a shared memory file. */ void attach_shared(char *name, char *opt) { struct stat64 st; shmfd = open(name, O_RDWR); if (shmfd < 0) { errno = 0; if (shmlen == 0) complain("need a --length to create a shared file"); shmfd = open(name, O_RDWR|O_CREAT, shmmode); if (shmfd < 0) nerror("cannot create file %s", name); } if (fstat64(shmfd, &st) < 0) err("shm stat"); /* the file size must be larger than mmap shmlen + shmoffset, otherwise SIGBUS * will be caused when we access memory, because mmaped memory is no longer in * the range of the file laster. */ if ((shmlen + shmoffset) > st.st_size) { if (ftruncate64(shmfd, shmlen + shmoffset) < 0) { /* XXX: we could do it by hand, but it would it would be impossible to apply policy then. need to fix that in the kernel. */ perror("ftruncate"); exit(1); } } shm_pagesize = st.st_blksize; check_region(opt); /* RED-PEN For shmlen > address space may need to map in pieces. Left for some poor 32bit soul. 
*/ shmptr = mmap64(NULL, shmlen, PROT_READ | PROT_WRITE, MAP_SHARED, shmfd, shmoffset); if (shmptr == (char*)-1) err("shm mmap"); } static void dumppol(unsigned long long start, unsigned long long end, int pol, struct bitmask *mask) { if (pol == MPOL_DEFAULT) return; printf("%016llx-%016llx: %s ", shmoffset+start, shmoffset+end, policy_name(pol)); printmask("", mask); } /* Dump policies in a shared memory segment. */ void dump_shm(void) { struct bitmask *nodes, *prevnodes, *tag; int prevpol = -1, pol; unsigned long long c, start; start = 0; if (shmlen == 0) { printf("nothing to dump\n"); return; } nodes = numa_allocate_nodemask(); tag = prevnodes = numa_allocate_nodemask(); for (c = 0; c < shmlen; c += shm_pagesize) { if (get_mempolicy(&pol, nodes->maskp, nodes->size, c+shmptr, MPOL_F_ADDR) < 0) err("get_mempolicy on shm"); if (pol == prevpol) continue; if (prevpol != -1) dumppol(start, c, prevpol, prevnodes); prevnodes = nodes; prevpol = pol; start = c; } dumppol(start, c, prevpol, prevnodes); numa_free_nodemask(nodes); numa_free_nodemask(tag); } static void dumpnode(unsigned long long start, unsigned long long end, int node) { printf("%016llx-%016llx: %d\n", shmoffset+start, shmoffset+end, node); } /* Dump nodes in a shared memory segment. */ void dump_shm_nodes(void) { int prevnode = -1, node; unsigned long long c, start; start = 0; if (shmlen == 0) { printf("nothing to dump\n"); return; } for (c = 0; c < shmlen; c += shm_pagesize) { if (get_mempolicy(&node, NULL, 0, c+shmptr, MPOL_F_ADDR|MPOL_F_NODE) < 0) err("get_mempolicy on shm"); if (node == prevnode) continue; if (prevnode != -1) dumpnode(start, c, prevnode); prevnode = node; start = c; } dumpnode(start, c, prevnode); } static void vwarn(char *ptr, char *fmt, ...) 
{ va_list ap; unsigned long off = (unsigned long)ptr - (unsigned long)shmptr; va_start(ap,fmt); printf("numactl verify %lx(%lx): ", (unsigned long)ptr, off); vprintf(fmt, ap); va_end(ap); exitcode = 1; } static unsigned interleave_next(unsigned cur, struct bitmask *mask) { int numa_num_nodes = numa_num_possible_nodes(); ++cur; while (!numa_bitmask_isbitset(mask, cur)) { cur = (cur+1) % numa_num_nodes; } return cur; } /* Verify policy in a shared memory segment */ void verify_shm(int policy, struct bitmask *nodes) { char *p; int ilnode, node; int pol2; struct bitmask *nodes2; if (policy == MPOL_INTERLEAVE) { if (get_mempolicy(&ilnode, NULL, 0, shmptr, MPOL_F_ADDR|MPOL_F_NODE) < 0) err("get_mempolicy"); } nodes2 = numa_allocate_nodemask(); for (p = shmptr; p - (char *)shmptr < shmlen; p += shm_pagesize) { if (get_mempolicy(&pol2, nodes2->maskp, nodes2->size, p, MPOL_F_ADDR) < 0) err("get_mempolicy"); if (pol2 != policy) { vwarn(p, "wrong policy %s, expected %s\n", policy_name(pol2), policy_name(policy)); goto out; } if (memcmp(nodes2->maskp, nodes->maskp, numa_bitmask_nbytes(nodes))) { vwarn(p, "mismatched node mask\n"); printmask("expected", nodes); printmask("real", nodes2); } if (get_mempolicy(&node, NULL, 0, p, MPOL_F_ADDR|MPOL_F_NODE) < 0) err("get_mempolicy"); switch (policy) { case MPOL_INTERLEAVE: if (node < 0 || !numa_bitmask_isbitset(nodes2, node)) vwarn(p, "interleave node out of range %d\n", node); if (node != ilnode) { vwarn(p, "expected interleave node %d, got %d\n", ilnode,node); goto out; } ilnode = interleave_next(ilnode, nodes2); break; case MPOL_PREFERRED_MANY: case MPOL_PREFERRED: case MPOL_BIND: if (!numa_bitmask_isbitset(nodes2, node)) { vwarn(p, "unexpected node %d\n", node); printmask("expected", nodes2); } break; case MPOL_DEFAULT: break; } } out: numa_free_nodemask(nodes2); } 07070100000033000081A40000000000000000000000016319106A000001A3000000000000000000000000000000000000002000000000numactl- extern int shmfd; extern long shmid; extern char 
*shmptr; extern unsigned long long shmlen; extern mode_t shmmode; extern unsigned long long shmoffset; extern int shmflags; extern void dump_shm(void); extern void dump_shm_nodes(void); extern void attach_shared(char *, char *); extern void attach_sysvshm(char *, char *); extern void verify_shm(int policy, struct bitmask *); /* in numactl.c */ extern int exitcode; 07070100000034000081A40000000000000000000000016319106A000018B9000000000000000000000000000000000000002700000000numactl- <stdio.h> #include <math.h> #include <float.h> #include <sys/time.h> #include <stdlib.h> #include "stream_lib.h" static inline double mysecond(void) { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + tv.tv_usec * 1.e-6; } /* * Program: Stream * Programmer: Joe R. Zagar * Revision: 4.0-BETA, October 24, 1995 * Original code developed by John D. McCalpin * * This program measures memory transfer rates in MB/s for simple * computational kernels coded in C. These numbers reveal the quality * of code generation for simple uncacheable kernels as well as showing * the cost of floating-point operations relative to memory accesses. * * INSTRUCTIONS: * * 1) Stream requires a good bit of memory to run. Adjust the * value of 'N' (below) to give a 'timing calibration' of * at least 20 clock-ticks. This will provide rate estimates * that should be good to about 5% precision. * * Hacked by AK to be a library */ long N = 8000000; #define NTIMES 10 #define OFFSET 0 /* * 3) Compile the code with full optimization. Many compilers * generate unreasonably bad code before the optimizer tightens * things up. If the results are unreasonably good, on the * other hand, the optimizer might be too smart for me! * * Try compiling with: * cc -O stream_d.c second_wall.c -o stream_d -lm * * This is known to work on Cray, SGI, IBM, and Sun machines. 
* * * 4) Mail the results to * Be sure to include: * a) computer hardware model number and software revision * b) the compiler flags * c) all of the output from the test case. * Thanks! * */ static int checktick(void); # define HLINE "-------------------------------------------------------------\n" # ifndef MIN # define MIN(x,y) ((x)<(y)?(x):(y)) # endif # ifndef MAX # define MAX(x,y) ((x)>(y)?(x):(y)) # endif static double *a, *b, *c; static double rmstime[4] = { 0 }, maxtime[4] = { 0}, mintime[4] = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX}; static char *label[4] = { "Copy: ", "Scale: ", "Add: ", "Triad: " }; char *stream_names[] = { "Copy","Scale","Add","Triad" }; static double bytes[4]; int stream_verbose = 1; #define Vprintf(x...) do { if (stream_verbose) printf(x); } while(0) void stream_check(void) { int quantum; int BytesPerWord; register int j; double t; /* --- SETUP --- determine precision and check timing --- */ Vprintf(HLINE); BytesPerWord = sizeof(double); Vprintf("This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); Vprintf(HLINE); Vprintf("Array size = %lu, Offset = %d\n", N, OFFSET); Vprintf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0); Vprintf("Each test is run %d times, but only\n", NTIMES); Vprintf("the *best* time for each is used.\n"); /* Get initial value for system clock. 
*/ for (j = 0; j < N; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } Vprintf(HLINE); if ((quantum = checktick()) >= 1) Vprintf("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else Vprintf("Your clock granularity appears to be " "less than one microsecond.\n"); t = mysecond(); for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); Vprintf("Each test below will take on the order" " of %d microseconds.\n", (int) t); Vprintf(" (= %d clock ticks)\n", (int) (t / quantum)); Vprintf("Increase the size of the arrays if this shows that\n"); Vprintf("you are not getting at least 20 clock ticks per test.\n"); Vprintf(HLINE); Vprintf("WARNING -- The above is only a rough guideline.\n"); Vprintf("For best results, please be sure you know the\n"); Vprintf("precision of your system timer.\n"); Vprintf(HLINE); } void stream_test(double *res) { register int j, k; double scalar, times[4][NTIMES]; /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k = 0; k < NTIMES; k++) { times[0][k] = mysecond(); for (j = 0; j < N; j++) c[j] = a[j]; times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); for (j = 0; j < N; j++) b[j] = scalar * c[j]; times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); for (j = 0; j < N; j++) c[j] = a[j] + b[j]; times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); for (j = 0; j < N; j++) a[j] = b[j] + scalar * c[j]; times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k = 0; k < NTIMES; k++) { for (j = 0; j < 4; j++) { rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]); mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } Vprintf ("Function Rate (MB/s) RMS time Min time Max time\n"); for (j = 0; j < 4; j++) { double speed = 1.0E-06 * bytes[j] / mintime[j]; rmstime[j] = sqrt(rmstime[j] / (double) NTIMES); Vprintf("%s%11.4f %11.4f %11.4f %11.4f\n", label[j], speed, rmstime[j], mintime[j], maxtime[j]); 
if (res) res[j] = speed; } } # define M 20 static int checktick(void) { int i, minDelta, Delta; double t1, t2, timesfound[M]; /* Collect a sequence of M unique time values from the system. */ for (i = 0; i < M; i++) { t1 = mysecond(); while (((t2 = mysecond()) - t1) < 1.0E-6); timesfound[i] = t1 = t2; } /* * Determine the minimum difference between these M values. * This result will be our estimate (in microseconds) for the * clock granularity. */ minDelta = 1000000; for (i = 1; i < M; i++) { Delta = (int) (1.0E6 * (timesfound[i] - timesfound[i - 1])); minDelta = MIN(minDelta, MAX(Delta, 0)); } return (minDelta); } void stream_setmem(unsigned long size) { N = (size - OFFSET) / (3*sizeof(double)); } long stream_memsize(void) { return 3*(sizeof(double) * (N+OFFSET)) ; } long stream_init(void *mem) { int i; for (i = 0; i < 4; i++) { rmstime[i] = 0; maxtime[i] = 0; mintime[i] = FLT_MAX; } bytes[0] = 2 * sizeof(double) * N; bytes[1] = 2 * sizeof(double) * N; bytes[2] = 3 * sizeof(double) * N; bytes[3] = 3 * sizeof(double) * N; a = mem; b = (double *)mem + (N+OFFSET); c = (double *)mem + 2*(N+OFFSET); stream_check(); return 0; } 07070100000035000081A40000000000000000000000016319106A000000EA000000000000000000000000000000000000002700000000numactl- stream_memsize(void); long stream_init(void *mem); #define STREAM_NRESULTS 4 void stream_test(double *res); void stream_check(void); void stream_setmem(unsigned long size); extern int stream_verbose; extern char *stream_names[]; 07070100000036000081A40000000000000000000000016319106A00000345000000000000000000000000000000000000002800000000numactl- <stdio.h> #include <sys/mman.h> #include <stdlib.h> #include "numa.h" #include "numaif.h" #include "util.h" #include "stream_lib.h" static void usage(void) { exit(1); } /* Run STREAM with a numa policy */ int main(int ac, char **av) { struct bitmask *nodes; char *map; long size; int policy; policy = parse_policy(av[1], av[2]); if (policy == MPOL_MAX) usage(); nodes = 
numa_allocate_nodemask(); if (av[1] && av[2]) nodes = numa_parse_nodestring(av[2]); if (!nodes) { printf ("<%s> is invalid\n", av[2]); exit(1); } size = stream_memsize(); map = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); if (map == (char*)-1) exit(1); if (mbind(map, size, policy, nodes->maskp, nodes->size, 0) < 0) perror("mbind"), exit(1); stream_init(map); stream_test(NULL); return 0; } 07070100000037000081A40000000000000000000000016319106A00001E33000000000000000000000000000000000000002400000000numactl-* Copyright (C) 2003,2004 Andi Kleen, SuSE Labs. libnuma is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; version 2.1. libnuma is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should find a copy of v2.1 of the GNU Lesser General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <unistd.h> #include <sys/types.h> #include <asm/unistd.h> #include <errno.h> #include "numa.h" #include "numaif.h" #include "numaint.h" #include "config.h" #include "util.h" #define WEAK __attribute__((weak)) #if !defined(__NR_mbind) || !defined(__NR_set_mempolicy) || \ !defined(__NR_get_mempolicy) || !defined(__NR_migrate_pages) || \ !defined(__NR_move_pages) #if defined(__x86_64__) #define __NR_sched_setaffinity 203 #define __NR_sched_getaffinity 204 /* Official allocation */ #define __NR_mbind 237 #define __NR_set_mempolicy 238 #define __NR_get_mempolicy 239 #define __NR_migrate_pages 256 #define __NR_move_pages 279 #elif defined(__ia64__) #define __NR_sched_setaffinity 1231 #define __NR_sched_getaffinity 1232 #define __NR_migrate_pages 1280 #define __NR_move_pages 1276 /* Official allocation */ #define __NR_mbind 1259 #define __NR_get_mempolicy 1260 #define __NR_set_mempolicy 1261 #elif defined(__i386__) #define __NR_mbind 274 #define __NR_get_mempolicy 275 #define __NR_set_mempolicy 276 #define __NR_migrate_pages 294 #define __NR_move_pages 317 #elif defined(__powerpc__) #define __NR_mbind 259 #define __NR_get_mempolicy 260 #define __NR_set_mempolicy 261 #define __NR_migrate_pages 258 /* FIXME: powerpc is missing move pages!!! #define __NR_move_pages xxx */ #elif defined(__loongarch__) //reference to /usr/include/asm-generic/unistd.h #define __NR_mbind 235 #define __NR_get_mempolicy 236 #define __NR_set_mempolicy 237 #define __NR_migrate_pages 238 #define __NR_move_pages 239 #elif defined(__mips__) #if _MIPS_SIM == _ABIO32 /* * Linux o32 style syscalls are in the range from 4000 to 4999. 
*/ #define __NR_Linux 4000 #define __NR_mbind (__NR_Linux + 268) #define __NR_get_mempolicy (__NR_Linux + 269) #define __NR_set_mempolicy (__NR_Linux + 270) #define __NR_migrate_pages (__NR_Linux + 287) #endif #if _MIPS_SIM == _ABI64 /* * Linux 64-bit syscalls are in the range from 5000 to 5999. */ #define __NR_Linux 5000 #define __NR_mbind (__NR_Linux + 227) #define __NR_get_mempolicy (__NR_Linux + 228) #define __NR_set_mempolicy (__NR_Linux + 229) #define __NR_migrate_pages (__NR_Linux + 246) #endif #if _MIPS_SIM == _ABIN32 /* * Linux N32 syscalls are in the range from 6000 to 6999. */ #define __NR_Linux 6000 #define __NR_mbind (__NR_Linux + 231) #define __NR_get_mempolicy (__NR_Linux + 232) #define __NR_set_mempolicy (__NR_Linux + 233) #define __NR_migrate_pages (__NR_Linux + 250) #endif #elif defined(__hppa__) #define __NR_migrate_pages 272 #elif defined(__arm__) /* */ #warning "ARM does not implement the migrate_pages() syscall" #elif defined(__s390x__) #define __NR_mbind 268 #define __NR_get_mempolicy 269 #define __NR_set_mempolicy 270 #define __NR_migrate_pages 287 #define __NR_move_pages 310 #elif !defined(DEPS_RUN) #error "Add syscalls for your architecture or update kernel headers" #endif #endif #ifndef __GLIBC_PREREQ # define __GLIBC_PREREQ(x,y) 0 #endif #if defined(__GLIBC__) && __GLIBC_PREREQ(2, 11) /* glibc 2.11 seems to have working 6 argument sycall. Use the glibc supplied syscall in this case. The version cut-off is rather arbitrary and could be probably earlier. */ #define syscall6 syscall #elif defined(__x86_64__) /* 6 argument calls on x86-64 are often buggy in both glibc and asm/unistd.h. Add a working version here. 
*/ long syscall6(long call, long a, long b, long c, long d, long e, long f) { long res; asm volatile ("movq %[d],%%r10 ; movq %[e],%%r8 ; movq %[f],%%r9 ; syscall" : "=a" (res) : "0" (call),"D" (a),"S" (b), "d" (c), [d] "g" (d), [e] "g" (e), [f] "g" (f) : "r11","rcx","r8","r10","r9","memory" ); if (res < 0) { errno = -res; res = -1; } return res; } #elif defined(__i386__) /* i386 has buggy syscall6 in glibc too. This is tricky to do in inline assembly because it clobbers so many registers. Do it out of line. */ asm( "__syscall6:\n" " pushl %ebp\n" " pushl %edi\n" " pushl %esi\n" " pushl %ebx\n" " movl (0+5)*4(%esp),%eax\n" " movl (1+5)*4(%esp),%ebx\n" " movl (2+5)*4(%esp),%ecx\n" " movl (3+5)*4(%esp),%edx\n" " movl (4+5)*4(%esp),%esi\n" " movl (5+5)*4(%esp),%edi\n" " movl (6+5)*4(%esp),%ebp\n" " int $0x80\n" " popl %ebx\n" " popl %esi\n" " popl %edi\n" " popl %ebp\n" " ret" ); extern long __syscall6(long n, long a, long b, long c, long d, long e, long f); long syscall6(long call, long a, long b, long c, long d, long e, long f) { long res = __syscall6(call,a,b,c,d,e,f); if (res < 0) { errno = -res; res = -1; } return res; } #else #define syscall6 syscall #endif long WEAK get_mempolicy(int *policy, unsigned long *nmask, unsigned long maxnode, void *addr, unsigned flags) { return syscall(__NR_get_mempolicy, policy, nmask, maxnode, addr, flags); } long WEAK mbind(void *start, unsigned long len, int mode, const unsigned long *nmask, unsigned long maxnode, unsigned flags) { return syscall6(__NR_mbind, (long)start, len, mode, (long)nmask, maxnode, flags); } long WEAK set_mempolicy(int mode, const unsigned long *nmask, unsigned long maxnode) { long i; i = syscall(__NR_set_mempolicy,mode,nmask,maxnode); return i; } long WEAK migrate_pages(int pid, unsigned long maxnode, const unsigned long *frommask, const unsigned long *tomask) { #if defined(__NR_migrate_pages) return syscall(__NR_migrate_pages, pid, maxnode, frommask, tomask); #else errno = ENOSYS; return -1; #endif } 
long WEAK move_pages(int pid, unsigned long count, void **pages, const int *nodes, int *status, int flags) { return syscall(__NR_move_pages, pid, count, pages, nodes, status, flags); } /* SLES8 glibc doesn't define those */ SYMVER("numa_sched_setaffinity_v1", "numa_sched_setaffinity@libnuma_1.1") int numa_sched_setaffinity_v1(pid_t pid, unsigned len, const unsigned long *mask) { return syscall(__NR_sched_setaffinity,pid,len,mask); } SYMVER("numa_sched_setaffinity_v2", "numa_sched_setaffinity@@libnuma_1.2") int numa_sched_setaffinity_v2(pid_t pid, struct bitmask *mask) { return syscall(__NR_sched_setaffinity, pid, numa_bitmask_nbytes(mask), mask->maskp); } SYMVER("numa_sched_getaffinity_v1", "numa_sched_getaffinity@libnuma_1.1") int numa_sched_getaffinity_v1(pid_t pid, unsigned len, const unsigned long *mask) { return syscall(__NR_sched_getaffinity,pid,len,mask); } SYMVER("numa_sched_getaffinity_v2", "numa_sched_getaffinity@@libnuma_1.2") int numa_sched_getaffinity_v2(pid_t pid, struct bitmask *mask) { /* len is length in bytes */ return syscall(__NR_sched_getaffinity, pid, numa_bitmask_nbytes(mask), mask->maskp); /* sched_getaffinity returns sizeof(cpumask_t) */ } make_internal_alias(numa_sched_getaffinity_v1); make_internal_alias(numa_sched_getaffinity_v2); make_internal_alias(numa_sched_setaffinity_v1); make_internal_alias(numa_sched_setaffinity_v2); 07070100000038000081A40000000000000000000000016319106A000004B6000000000000000000000000000000000000002200000000numactl-* Utility functions for reading sysfs values */ #define _GNU_SOURCE 1 #include <stdio.h> #include <sys/fcntl.h> #include <stdlib.h> #include <unistd.h> #include <stdarg.h> #include <ctype.h> #include "numa.h" #include "numaint.h" #define SYSFS_BLOCK 4096 hidden char *sysfs_read(char *name) { char *buf; int n; int fd; buf = malloc(SYSFS_BLOCK); if (!buf) return NULL; fd = open(name, O_RDONLY); n = read(fd, buf, SYSFS_BLOCK - 1); close(fd); if (n <= 0) { free(buf); return NULL; } buf[n] = 0; return buf; 
} hidden int sysfs_node_read(struct bitmask *mask, char *fmt, ...) { int n, ret = 0; va_list ap; char *p, *fn, *m, *end; int num; va_start(ap, fmt); n = vasprintf(&fn, fmt, ap); va_end(ap); if (n < 0) return -1; p = sysfs_read(fn); free(fn); if (!p) return -1; m = p; do { num = strtol(m, &end, 0); if (m == end) { ret = -1; goto out; } if (num < 0) { ret = -2; goto out; } if (num >= numa_num_task_nodes()) { ret = -1; goto out; } numa_bitmask_setbit(mask, num); /* Continuation not supported by kernel yet. */ m = end; while (isspace(*m) || *m == ',') m++; } while (isdigit(*m)); out: free(p); return ret; } 07070100000039000081A40000000000000000000000016319106A00000077000000000000000000000000000000000000002200000000numactl- bitmask; hidden char *sysfs_read(char *name); hidden int sysfs_node_read(struct bitmask *mask, char *fmt, ...); 0707010000003A000041ED0000000000000000000000016319106A00000000000000000000000000000000000000000000001F00000000numactl- Various simple test scripts to verify some parts of the NUMA API. To do a full regression test run make test You should have at least two nodes on a NUMA system for the test suite. The tests in regress assume that there is enough memory free on nodes 0/1. They consider PREFERRED/INTERLEAVE not hitting the first choice node an error. They also require a relatively idle machine to avoid too much noise from memory allocation from other processes. Without that regress1 might fail. You can run the tests under valgrind with VALGRIND=valgrind make test Older valgrind versions incorrectly report a uninitialized byte error on set_mempolicy. That is a false positive. TBD: more detailed unit tests for mbind / shm / {get,set}_mempolicy Currently everything is tested using numactl only. 
0707010000003C000081ED0000000000000000000000016319106A000009B4000000000000000000000000000000000000002A00000000numactl-!/bin/bash # This simple script checks --all/-a option which is used for # suppressing of default cpuset awareness of options --cpunodebind, # --physcpubind, --interleave, --preferred and --membind. # NOTE: Test needs two nodes and two cpus at least testdir=`dirname "$0"` : ${srcdir:=${testdir}/..} : ${builddir:=${srcdir}} export PATH=${builddir}:$PATH export old_mask eval_test() { # echo "Running $1.." $1 if [ $? == 1 ] ; then echo -e "$1 FAILED!" reset_mask exit 1 fi echo -e "$1 PASSED" } function check_arg_order { numactl --all --physcpubind=$HIGHESTCPU ls > /dev/null 2>&1 if [ $? == 1 ] ; then return 1; fi numactl --physcpubind=$HIGHESTCPU --all ls > /dev/null 2>&1 if [ $? == 0 ] ; then return 1; fi return 0 } function check_physcpubind { reset_mask set_cpu_affinity 0 numactl --physcpubind=$HIGHESTCPU ls > /dev/null 2>&1 if [ $? == 0 ] ; then # shouldn't pass so easy return 1; fi numactl --all --physcpubind=$HIGHESTCPU ls > /dev/null 2>&1 if [ $? == 1 ] ; then # shouldn't fail return 1; fi return 0 } function check_cpunodebind { local low_cpu_range local high_cpu reset_mask low_cpu_range=$(cat /sys/devices/system/node/node$LOWESTNODE/cpulist) set_cpu_affinity $low_cpu_range numactl --cpunodebind=$HIGHESTNODE ls > /dev/null 2>&1 if [ $? == 1 ] ; then # should pass return 1; fi numactl --all --cpunodebind=$HIGHESTNODE ls > /dev/null 2>&1 if [ $? 
== 1 ] ; then # should pass for sure return 1; fi return 0 } function set_cpu_affinity { taskset -p -c $1 $$ > /dev/null #echo -e "\taffinity of shell was set to" $1 } function get_mask { old_mask=$(taskset -p $$ | cut -f2 -d: | sed -e 's/^[ \t]*//') } function reset_mask { taskset -p $old_mask $$ > /dev/null #echo -e "\taffinity of shell was reset to" $old_mask } ARCH=`uname -m` if [ ${ARCH} != "s390x" ]; then HIGHESTCPU=$(grep 'processor' /proc/cpuinfo | tail -n1 | cut -f2 -d':') else HIGHESTCPU=$(grep 'processor' /proc/cpuinfo | tail -n1 | cut -f2 | sed 's/://' ) fi HIGHESTCPU=$(echo $HIGHESTCPU | cut -f2 -d' ') HIGHESTNODE=$(numactl -H | grep -Pzo 'node [0-9]* cpus: [0-9].*(.|\n)node [0-9]* size: [1-9].* MB' | tail -n1 | cut -f2 -d' ') LOWESTNODE=$(numactl -H | grep -Pzo 'node [0-9]* cpus: [0-9].*(.|\n)node [0-9]* size: [1-9].* MB' | head -n1 | cut -f2 -d' ') get_mask eval_test check_arg_order eval_test check_physcpubind eval_test check_cpunodebind reset_mask exit 0 0707010000003D000081ED0000000000000000000000016319106A0000035A000000000000000000000000000000000000002D00000000numactl-!/bin/bash # check if affinity works testdir=`dirname "$0"` : ${srcdir:=${testdir}/..} : ${builddir:=${srcdir}} export PATH=${builddir}:$PATH S=`numactl --show | grep nodebind:` NODES=`echo $S | sed -e "s/nodebind://"` S=`numactl --show | grep physcpubind:` CPUS=`echo $S | sed -e "s/physcpubind://"` for i in $CPUS ; do if [ "$(numactl --physcpubind=$i "${testdir}"/printcpu)" != "$i" ] ; then echo "--physcpubind for $i doesn't work" exit 1 fi if [ "$(numactl --physcpubind=$i numactl --show | awk '/^physcpubind/ { print $2 }' )" != "$i" ] ; then echo "--show doesn't agree with physcpubind for cpu $i" exit 1 fi done for i in $NODES ; do if [ $(numactl --cpunodebind=$i numactl --show | awk '/nodebind/ { print $2 }' ) != $i ] ; then echo "--show doesn't agree with cpunodebind for node $i" exit 1 fi done 
0707010000003E000081ED0000000000000000000000016319106A00000596000000000000000000000000000000000000002D00000000numactl-!/bin/bash # check numactl --hardware output # this checks most of the topology discovery in libnuma testdir=`dirname "$0"` : ${srcdir:=${testdir}/..} : ${builddir:=${srcdir}} export PATH=${builddir}:$PATH numcpus=$(grep -c processor /proc/cpuinfo) numnodes=$(ls -1d /sys/devices/system/node/node[0-9]* | wc -l) nccpus=$(numactl --hardware | grep cpus | sed 's/node.*cpus://' | wc -w ) ncnodes=$(numactl --hardware | grep -c 'node.*size' ) node_has_cpus="" if [ $numnodes != $ncnodes ] ; then echo "numactl --hardware doesnt report all nodes" exit 1 fi if [ $numcpus != $nccpus -a \( $[$nccpus / $numnodes] != $numcpus \) ] ; then echo "numactl --hardware cpus look bogus" exit 1 fi if [ -s /sys/devices/system/node/has_cpu ]; then node_has_cpus=$(cat /sys/devices/system/node/has_cpu | sed 's/,/ /') fi numactl --hardware | grep cpus | while read n ; do node=${n/ cpus*/} node=${node/ /} cpus=${n/*: /} check_node=$(echo $node | sed 's/node//') if [[ -n ${node_has_cpus} ]]; then if ! [[ "${node_has_cpus}" == *"$check_node"* ]]; then echo "Skipping cpu less $node" continue fi fi k=0 for i in $cpus ; do if [ ! 
-h "/sys/devices/system/node/$node/cpu$i" ] ; then echo "$node doesn't have cpu $i" exit 1 fi k=$[$k+1] done if [ $k != $(echo $cpus | wc -w) ] ; then echo "$node missing cpu" exit 1 fi done 0707010000003F000081A40000000000000000000000016319106A00000411000000000000000000000000000000000000002A00000000numactl-* Test numa_distance */ #include <numa.h> #include <stdio.h> #include <stdlib.h> /* Collect every node set in numa_nodes_ptr, then check that each node's distance to itself is 10 and that distances are symmetric. Exits 1 on the first violation, returns 0 otherwise. */ int main(void) { int maxnode, a, b, got_nodes = 0; int *node_to_use; if (numa_available() < 0) { printf("no numa support in kernel\n"); exit(1); } maxnode = numa_max_node(); /* node ids run from 0 to maxnode inclusive, so maxnode + 1 slots are needed */ node_to_use = (int *)malloc((maxnode + 1) * sizeof(int)); if (!node_to_use) { perror("malloc"); exit(1); } for (a = 0; a <= maxnode; a++) { if (numa_bitmask_isbitset(numa_nodes_ptr, a)){ node_to_use[got_nodes++] = a; } } for (a = 0; a < got_nodes; a++){ printf("%03d: ", node_to_use[a]); if (numa_distance(node_to_use[a], node_to_use[a]) != 10) { printf("%d: self distance is not 10 (%d)\n", node_to_use[a], numa_distance(node_to_use[a],node_to_use[a])); exit(1); } for (b = 0; b < got_nodes; b++) { int d1 = numa_distance(node_to_use[a], node_to_use[b]); int d2 = numa_distance(node_to_use[b], node_to_use[a]); printf("%03d ", d1); if (d1 != d2) { printf("\n(%d,%d)->(%d,%d) wrong!\n",node_to_use[a],node_to_use[b],d1,d2); exit(1); } } printf("\n"); } return 0; } 07070100000040000081A40000000000000000000000016319106A00000084000000000000000000000000000000000000002600000000numactl- <sys/ipc.h> #include <stdio.h> int main(int ac, char **av) { while (*++av) printf("0x%x\n", ftok(*av, 0)); return 0; } 07070100000041000081A40000000000000000000000016319106A00000246000000000000000000000000000000000000002D00000000numactl- <sched.h> #include <sys/types.h> #include <unistd.h> #include <stdlib.h> #include <stdio.h> #include <numa.h> int main(int argc, char *argv[]) { nodemask_t nodemask; int rc, i; rc = numa_available(); printf("numa_available returns %d\n", rc); if (rc < 0) exit(1); nodemask_zero(&nodemask); nodemask = numa_get_run_node_mask(); for (i = 0; i < 4; i++) {
printf("numa_get_run_node_mask nodemask_isset returns=0x%lx\n", nodemask_isset(&nodemask, i)); } rc = numa_run_on_node_mask(&nodemask); printf("rc=%d from numa_run_on_node_mask\n", rc); return (0); } 07070100000042000081A40000000000000000000000016319106A00000BF7000000000000000000000000000000000000003100000000numactl-* * Test program to test the moving of pages using mbind. * * (C) 2006 Silicon Graphics, Inc. * Christoph Lameter <> */ #include <stdio.h> #include <stdlib.h> #include <numa.h> #include <numaif.h> #include <unistd.h> #include <asm/unistd.h> unsigned int pagesize; unsigned int page_count = 32; char *page_base; char *pages; void **addr; int *status; int *nodes; int errors; int nr_nodes; struct bitmask *old_nodes; struct bitmask *new_nodes; int main(int argc, char **argv) { int i, rc; pagesize = getpagesize(); nr_nodes = numa_max_node()+1; old_nodes = numa_bitmask_alloc(nr_nodes); new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(old_nodes, 0); numa_bitmask_setbit(new_nodes, 1); if (nr_nodes < 2) { printf("A minimum of 2 nodes is required for this test.\n"); exit(1); } setbuf(stdout, NULL); printf("mbind migration test ......\n"); if (argc > 1) sscanf(argv[1], "%d", &page_count); page_base = malloc((pagesize + 1) * page_count); addr = malloc(sizeof(char *) * page_count); status = malloc(sizeof(int *) * page_count); nodes = malloc(sizeof(int *) * page_count); if (!page_base || !addr || !status || !nodes) { printf("Unable to allocate memory\n"); exit(1); } pages = (void *) ((((long)page_base) & ~((long)(pagesize - 1))) + pagesize); for (i = 0; i < page_count; i++) { if (i != 2) /* We leave page 2 unallocated */ pages[ i * pagesize ] = (char) i; addr[i] = pages + i * pagesize; nodes[i] = 0; status[i] = -123; } /* Move pages toi node zero */ numa_move_pages(0, page_count, addr, nodes, status, 0); printf("\nPage status before page migration\n"); printf("---------------------------------\n"); rc = numa_move_pages(0, page_count, addr, NULL, status, 
0); if (rc < 0) { perror("move_pages"); exit(1); } for (i = 0; i < page_count; i++) { printf("Page %d vaddr=%p node=%d\n", i, pages + i * pagesize, status[i]); if (i != 2 && status[i]) { printf("Bad page state. Page %d status %d\n",i, status[i]); exit(1); } } /* Move to node zero */ printf("\nMoving pages via mbind to node 0 ...\n"); rc = mbind(pages, page_count * pagesize, MPOL_BIND, old_nodes->maskp, old_nodes->size + 1, MPOL_MF_MOVE | MPOL_MF_STRICT); if (rc < 0) { perror("mbind"); errors++; } printf("\nMoving pages via mbind from node 0 to 1 ...\n"); rc = mbind(pages, page_count * pagesize, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, MPOL_MF_MOVE | MPOL_MF_STRICT); if (rc < 0) { perror("mbind"); errors++; } numa_move_pages(0, page_count, addr, NULL, status, 0); for (i = 0; i < page_count; i++) { printf("Page %d vaddr=%lx node=%d\n", i, (unsigned long)(pages + i * pagesize), status[i]); if (i != 2) { if (pages[ i* pagesize ] != (char) i) { printf("*** Page content corrupted.\n"); errors++; } else if (status[i] != 1) { printf("*** Page on wrong node.\n"); errors++; } } } if (!errors) printf("Test successful.\n"); else printf("%d errors.\n", errors); return errors > 0 ? 1 : 0; } 07070100000043000081A40000000000000000000000016319106A00000B98000000000000000000000000000000000000002F00000000numactl-* * Test program to test the moving of a processes pages. * * (C) 2006 Silicon Graphics, Inc. 
* Christoph Lameter <> */ #include <stdio.h> #include <stdlib.h> #include <numa.h> #include <unistd.h> #include <errno.h> unsigned int pagesize; unsigned int page_count = 32; char *page_base; char *pages; void **addr; int *status; int *nodes; int errors; int nr_nodes; struct bitmask *old_nodes; struct bitmask *new_nodes; int main(int argc, char **argv) { int i, rc; pagesize = getpagesize(); nr_nodes = numa_max_node()+1; old_nodes = numa_bitmask_alloc(nr_nodes); new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(old_nodes, 1); numa_bitmask_setbit(new_nodes, 0); if (nr_nodes < 2) { printf("A minimum of 2 nodes is required for this test.\n"); exit(1); } setbuf(stdout, NULL); printf("migrate_pages() test ......\n"); if (argc > 1) sscanf(argv[1], "%d", &page_count); page_base = malloc((pagesize + 1) * page_count); addr = malloc(sizeof(char *) * page_count); status = malloc(sizeof(int *) * page_count); nodes = malloc(sizeof(int *) * page_count); if (!page_base || !addr || !status || !nodes) { printf("Unable to allocate memory\n"); exit(1); } pages = (void *) ((((long)page_base) & ~((long)(pagesize - 1))) + pagesize); for (i = 0; i < page_count; i++) { if (i != 2) /* We leave page 2 unallocated */ pages[ i * pagesize ] = (char) i; addr[i] = pages + i * pagesize; nodes[i] = 1; status[i] = -123; } /* Move to starting node */ rc = numa_move_pages(0, page_count, addr, nodes, status, 0); if (rc < 0 && errno != ENOENT) { perror("move_pages"); exit(1); } /* Verify correct startup locations */ printf("Page location at the beginning of the test\n"); printf("------------------------------------------\n"); numa_move_pages(0, page_count, addr, NULL, status, 0); for (i = 0; i < page_count; i++) { printf("Page %d vaddr=%p node=%d\n", i, pages + i * pagesize, status[i]); if (i != 2 && status[i] != 1) { printf("Bad page state before migrate_pages. 
Page %d status %d\n",i, status[i]); exit(1); } } /* Move to node zero */ numa_move_pages(0, page_count, addr, nodes, status, 0); printf("\nMigrating the current processes pages ...\n"); rc = numa_migrate_pages(0, old_nodes, new_nodes); if (rc < 0) { perror("numa_migrate_pages failed"); errors++; } /* Get page state after migration */ numa_move_pages(0, page_count, addr, NULL, status, 0); for (i = 0; i < page_count; i++) { printf("Page %d vaddr=%lx node=%d\n", i, (unsigned long)(pages + i * pagesize), status[i]); if (i != 2) { if (pages[ i* pagesize ] != (char) i) { printf("*** Page contents corrupted.\n"); errors++; } else if (status[i]) { printf("*** Page on the wrong node\n"); errors++; } } } if (!errors) printf("Test successful.\n"); else printf("%d errors.\n", errors); return errors > 0 ? 1 : 0; } 07070100000044000081A40000000000000000000000016319106A00000B35000000000000000000000000000000000000002C00000000numactl-* * Test program to test the moving of individual pages in a process. * * (C) 2006 Silicon Graphics, Inc. 
* Christoph Lameter <> */ #include <stdio.h> #include <stdlib.h> #include "numa.h" #include <unistd.h> #include <asm/unistd.h> unsigned int pagesize; unsigned int page_count = 32; char *page_base; char *pages; void **addr; int *status; int *nodes; int errors; int nr_nodes; int *node_to_use; int get_node_list() { int a, got_nodes = 0, max_node, numnodes; long long free_node_sizes; numnodes = numa_num_configured_nodes(); node_to_use = (int *)malloc(numnodes * sizeof(int)); max_node = numa_max_node(); for (a = 0; a <= max_node; a++) { if (numa_node_size(a, &free_node_sizes) > 0) node_to_use[got_nodes++] = a; } if(got_nodes != numnodes) return -1; return got_nodes; } int main(int argc, char **argv) { int i, rc; pagesize = getpagesize(); nr_nodes = get_node_list(); if (nr_nodes < 2) { printf("A minimum of 2 nodes is required for this test.\n"); exit(77); } if (nr_nodes == -1) { printf("Mismatch between congfigured nodes and memory-rich nodes.\n"); exit(1); } setbuf(stdout, NULL); printf("move_pages() test ......\n"); if (argc > 1) sscanf(argv[1], "%d", &page_count); printf("pages=%d (%s)\n", page_count, argv[1]); page_base = malloc((pagesize + 1) * page_count); addr = malloc(sizeof(char *) * page_count); status = malloc(sizeof(int *) * page_count); nodes = malloc(sizeof(int *) * page_count); if (!page_base || !addr || !status || !nodes) { printf("Unable to allocate memory\n"); exit(1); } pages = (void *) ((((long)page_base) & ~((long)(pagesize - 1))) + pagesize); for (i = 0; i < page_count; i++) { if (i != 2) /* We leave page 2 unallocated */ pages[ i * pagesize ] = (char) i; addr[i] = pages + i * pagesize; nodes[i] = node_to_use[(i % nr_nodes)]; status[i] = -123; } printf("\nMoving pages to start node ...\n"); rc = numa_move_pages(0, page_count, addr, NULL, status, 0); if (rc < 0) perror("move_pages"); for (i = 0; i < page_count; i++) printf("Page %d vaddr=%p node=%d\n", i, pages + i * pagesize, status[i]); printf("\nMoving pages to target nodes ...\n"); rc = 
numa_move_pages(0, page_count, addr, nodes, status, 0); if (rc < 0) { perror("move_pages"); errors++; } for (i = 0; i < page_count; i++) { if (i != 2) { if (pages[ i* pagesize ] != (char) i) errors++; else if (nodes[i] != node_to_use[(i % nr_nodes)]) errors++; } } for (i = 0; i < page_count; i++) { printf("Page %d vaddr=%lx node=%d\n", i, (unsigned long)(pages + i * pagesize), status[i]); } if (!errors) printf("Test successful.\n"); else printf("%d errors.\n", errors); return errors > 0 ? 1 : 0; } 07070100000045000081A40000000000000000000000016319106A0000010E000000000000000000000000000000000000002800000000numactl- <numa.h> #include <numaif.h> #include <stdio.h> int main(void) { int nd; char *man = numa_alloc(1000); *man = 1; if (get_mempolicy(&nd, NULL, 0, man, MPOL_F_NODE|MPOL_F_ADDR) < 0) perror("get_mempolicy"); else printf("my node %d\n", nd); return 0; } 07070100000046000081A40000000000000000000000016319106A00000178000000000000000000000000000000000000002C00000000numactl-* Test wrapper for the nodemask parser */ #include <stdio.h> #include "numa.h" #include "util.h" int main(int ac, char **av) { int err = 0; while (*++av) { struct bitmask *mask = numa_parse_nodestring(*av); if (!mask) { printf("Failed to convert `%s'\n", *av); err |= 1; continue; } printmask("result", mask); numa_bitmask_free(mask); } return err; } 07070100000047000081A40000000000000000000000016319106A00000243000000000000000000000000000000000000002900000000numactl- "numa.h" #include <stdio.h> #include <stdlib.h> int main(void) { int i, k, w, ncpus; struct bitmask *cpus; int maxnode = numa_num_configured_nodes()-1; if (numa_available() < 0) { printf("no numa\n"); exit(1); } cpus = numa_allocate_cpumask(); ncpus = cpus->size; for (i = 0; i <= maxnode ; i++) { if (numa_node_to_cpus(i, cpus) < 0) { printf("node %d failed to convert\n",i); } printf("%d: ", i); w = 0; for (k = 0; k < ncpus; k++) if (numa_bitmask_isbitset(cpus, k)) printf(" %s%d", w>0?",":"", k); putchar('\n'); } return 0; } 
07070100000048000081ED0000000000000000000000016319106A0000009B000000000000000000000000000000000000002800000000numactl-!/bin/sh testdir=`dirname "$0"` : ${srcdir:=${testdir}/..} : ${builddir:=${srcdir}} export PATH=${builddir}:$PATH exec "${builddir}"/numademo -t -e 10M 07070100000049000081A40000000000000000000000016319106A00000066000000000000000000000000000000000000002A00000000numactl- <unistd.h> #include <stdio.h> int main(void) { printf("%d\n", getpagesize()); return 0; } 0707010000004A000081A40000000000000000000000016319106A00000A17000000000000000000000000000000000000002A00000000numactl-* Test prefer policy */ #include "numa.h" #include "numaif.h" #include <sys/mman.h> #include <stdio.h> #include <assert.h> #include <unistd.h> #include <stdlib.h> #include <errno.h> #define err(x) perror(x),exit(1) extern void printmask(char *name, struct bitmask *mask); int main(void) { int max = numa_max_node(); int maxmask = numa_num_possible_nodes(); struct bitmask *nodes, *mask; int pagesize = getpagesize(); int i; int pol; int node; int err = 0; nodes = numa_bitmask_alloc(maxmask); mask = numa_bitmask_alloc(maxmask); /* Step 1. test 'preferred' policy */ printf("\nTesting MPOL_PREFERRED policy:\n\n"); for (i = max; i >= 0; --i) { char *mem = mmap(NULL, pagesize*(max+1), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); char *adr = mem; if (mem == (char *)-1) err("mmap"); printf("%d offset %lx\n", i, (long)(adr - mem)); numa_bitmask_clearall(nodes); numa_bitmask_clearall(mask); numa_bitmask_setbit(nodes, i); if (mbind(adr, pagesize, MPOL_PREFERRED, nodes->maskp, nodes->size, 0) < 0) err("mbind"); ++*adr; if (get_mempolicy(&pol, mask->maskp, mask->size, adr, MPOL_F_ADDR) < 0) err("get_mempolicy"); assert(pol == MPOL_PREFERRED); assert(numa_bitmask_isbitset(mask, i)); node = 0x123; if (get_mempolicy(&node, NULL, 0, adr, MPOL_F_ADDR|MPOL_F_NODE) < 0) err("get_mempolicy2"); printf("got node %d expected %d\n", node, i); if (node != i) err = 1; } /* Step 2. 
test 'preferred-many' policy */ if (max < 1) return err; printf("\nTesting MPOL_PREFERRED_MANY policy:\n\n"); for (i = max; i >= 1; --i) { char *mem = mmap(NULL, pagesize*(max+1), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); char *adr = mem; if (mem == (char *)-1) err("mmap"); numa_bitmask_clearall(nodes); numa_bitmask_clearall(mask); /* Set 2 nodes */ numa_bitmask_setbit(nodes, i); numa_bitmask_setbit(nodes, i - 1); if (mbind(adr, pagesize, MPOL_PREFERRED_MANY, nodes->maskp, nodes->size, 0) < 0) err("mbind"); ++*adr; if (get_mempolicy(&pol, mask->maskp, mask->size, adr, MPOL_F_ADDR) < 0) err("get_mempolicy"); assert(pol == MPOL_PREFERRED_MANY); printmask("Got nodes", mask); printmask("Expected nodes", nodes); if (!numa_bitmask_equal(mask, nodes)) err = 1; node = 0x123; if (get_mempolicy(&node, NULL, 0, adr, MPOL_F_ADDR|MPOL_F_NODE) < 0) err("get_mempolicy2"); printf("Got node: %d ", node); printmask("Expected nodes", nodes); if (!numa_bitmask_isbitset(nodes, node)) err = 1; } return err; } 0707010000004B000081ED0000000000000000000000016319106A00000064000000000000000000000000000000000000002800000000numactl-!/bin/bash #print cpu it is running on declare -a arr arr=( $(< /proc/self/stat) ) echo ${arr[38]} 0707010000004C000081A40000000000000000000000016319106A00000E96000000000000000000000000000000000000002900000000numactl-* Randomly change policy */ #include <stdio.h> #include "numa.h" #include "numaif.h" #include <sys/mman.h> #include <sys/shm.h> #include <sys/ipc.h> #include <stdlib.h> #include <time.h> #include <unistd.h> #include <string.h> #include <errno.h> #define SIZE (100*1024*1024) #define PAGES (SIZE/pagesize) #define perror(x) printf("%s: %s\n", x, strerror(errno)) #define err(x) perror(x),exit(1) struct page { unsigned long mask; int policy; }; struct page *pages; char *map; int pagesize; void setpol(unsigned long offset, unsigned long length, int policy, unsigned long nodes) { long i, end; printf("off:%lx length:%lx policy:%d nodes:%lx\n", 
offset, length, policy, nodes); if (mbind(map + offset*pagesize, length*pagesize, policy, &nodes, 8, 0) < 0) { printf("mbind: %s offset %lx length %lx policy %d nodes %lx\n", strerror(errno), offset*pagesize, length*pagesize, policy, nodes); return; } for (i = offset; i < offset+length; i++) { pages[i].mask = nodes; pages[i].policy = policy; } i = offset - 20; if (i < 0) i = 0; end = offset+length+20; if (end > PAGES) end = PAGES; for (; i < end; i++) { int pol2; unsigned long nodes2; if (get_mempolicy(&pol2, &nodes2, sizeof(long)*8, map+i*pagesize, MPOL_F_ADDR) < 0) err("get_mempolicy"); if (pol2 != pages[i].policy) { printf("%lx: got policy %d expected %d, nodes got %lx expected %lx\n", i, pol2, pages[i].policy, nodes2, pages[i].mask); } if (policy != MPOL_DEFAULT && nodes2 != pages[i].mask) { printf("%lx: nodes %lx, expected %lx, policy %d\n", i, nodes2, pages[i].mask, policy); } } } static unsigned char pop4[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; int popcnt(unsigned long val) { int count = 0; while (val) { count += pop4[val & 0xf]; val >>= 4; } return count; } void testmap(void) { pages = calloc(1, PAGES * sizeof(struct page)); if (!pages) exit(100); printf("simple tests\n"); #define MB ((1024*1024)/pagesize) setpol(0, PAGES, MPOL_INTERLEAVE, 3); setpol(0, MB, MPOL_BIND, 1); setpol(MB, MB, MPOL_BIND, 1); setpol(MB, MB, MPOL_DEFAULT, 0); setpol(MB, MB, MPOL_PREFERRED, 2); setpol(MB/2, MB, MPOL_DEFAULT, 0); setpol(MB+MB/2, MB, MPOL_BIND, 2); setpol(MB/2+100, 100, MPOL_PREFERRED, 1); setpol(100, 200, MPOL_PREFERRED, 1); printf("done\n"); for (;;) { unsigned long offset = random() % PAGES; int policy = random() % (MPOL_MAX); unsigned long nodes = random() % 4; long length = random() % (PAGES - offset); /* validate */ switch (policy) { case MPOL_DEFAULT: nodes = 0; break; case MPOL_INTERLEAVE: case MPOL_BIND: if (nodes == 0) continue; break; case MPOL_PREFERRED: if (popcnt(nodes) != 1) continue; break; } setpol(offset, length, policy, nodes); } } 
int main(int ac, char **av) { unsigned long seed; pagesize = getpagesize(); #if 0 map = mmap(NULL, SIZE, PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, 0, 0); if (map == (char*)-1) err("mmap"); #else int shmid = shmget(IPC_PRIVATE, SIZE, IPC_CREAT|0666); if (shmid < 0) err("shmget"); map = shmat(shmid, NULL, SHM_RDONLY); shmctl(shmid, IPC_RMID, NULL); if (map == (char *)-1) err("shmat"); printf("map %p\n", map); #endif if (av[1]) { char *end; unsigned long timeout = strtoul(av[1], &end, 0); switch (*end) { case 'h': timeout *= 3600; break; case 'm': timeout *= 60; break; } printf("running for %lu seconds\n", timeout); alarm(timeout); } else printf("running forever\n"); if (av[1] && av[2]) seed = strtoul(av[2], 0, 0); else seed = time(0); printf("random seed %lu\n", seed); srandom(seed); testmap(); /* test shm etc. */ return 0; } 0707010000004D000081A40000000000000000000000016319106A00000992000000000000000000000000000000000000002E00000000numactl- <assert.h> #include <errno.h> #include <limits.h> #include <unistd.h> #include <stdlib.h> #include <stdio.h> #include <sys/mman.h> #include "numa.h" #include "numaif.h" #define DEFAULT_NR_PAGES 1024 static int parse_int(const char *str) { char *endptr; long ret = strtol(str, &endptr, 0); if (*endptr != '\0') { fprintf(stderr, "[error] strtol() failed: parse error: %s\n", endptr); exit(1); } if (errno == ERANGE) fprintf(stderr, "[warning] strtol() out of range\n"); if (ret > INT_MAX || ret < INT_MIN) { fprintf(stderr, "[warning] parse_int() out of range\n"); ret = (ret > 0) ? 
INT_MAX : INT_MIN; } return (int) ret; } int main(int argc, char **argv) { char *mem; int page_size = numa_pagesize(); int node = 0; int nr_pages = DEFAULT_NR_PAGES; if (numa_available() < 0) { fprintf(stderr, "numa is not available"); exit(1); } if (argc > 1) node = parse_int(argv[1]); if (argc > 2) nr_pages = parse_int(argv[2]); mem = numa_alloc_onnode(page_size, node); /* Store the policy of the newly allocated area */ unsigned long nodemask; int mode; int nr_nodes = numa_num_possible_nodes(); if (get_mempolicy(&mode, &nodemask, nr_nodes, mem, MPOL_F_NODE | MPOL_F_ADDR) < 0) { perror("get_mempolicy() failed"); exit(1); } /* Print some info */ printf("Page size: %d\n", page_size); printf("Pages realloc'ed: %d\n", nr_pages); printf("Allocate data in node: %d\n", node); int i; int nr_inplace = 0; int nr_moved = 0; for (i = 0; i < nr_pages; i++) { /* Enlarge mem with one more page */ char *new_mem = numa_realloc(mem, (i+1)*page_size, (i+2)*page_size); if (!new_mem) { perror("numa_realloc() failed"); exit(1); } if (new_mem == mem) ++nr_inplace; else ++nr_moved; mem = new_mem; /* Check the policy of the realloc'ed area */ unsigned long realloc_nodemask; int realloc_mode; if (get_mempolicy(&realloc_mode, &realloc_nodemask, nr_nodes, mem, MPOL_F_NODE | MPOL_F_ADDR) < 0) { perror("get_mempolicy() failed"); exit(1); } assert(realloc_nodemask == nodemask && realloc_mode == mode && "policy changed"); } /* Shrink to the original size */ mem = numa_realloc(mem, (nr_pages + 1)*page_size, page_size); if (!mem) { perror("numa_realloc() failed"); exit(1); } numa_free(mem, page_size); printf("In-place reallocs: %d\n", nr_inplace); printf("Moved reallocs: %d\n", nr_moved); return 0; } 0707010000004E000081ED0000000000000000000000016319106A000014C2000000000000000000000000000000000000002700000000numactl-!/bin/bash # simple regression test for numactl/numaapi # must be run from 'test' directory of numactl source package, # after build [just use 'make test'] # note the statistics checks 
may fail when the system is under # memory pressure # Copyright 2003,2004 Andi Kleen, SuSE Labs. testdir=`dirname "$0"` : ${srcdir:=${testdir}/..} : ${builddir:=${srcdir}} export PATH=${builddir}:$PATH : ${NUMACTL:=${builddir}/numactl} VALGRIND=${VALGRIND:-} MB=$[1024*1024] SIZE=$[15 * $MB] DEMOSIZE=$[10 * $MB] STAT_INTERVAL=5 PAGESIZE=$("${builddir}/test/pagesize") PAGES=$[ $SIZE / $PAGESIZE ] HALFPAGES=$[ $PAGES / 2 ] HALFPAGES=$[ $HALFPAGES - 100 ] DOUBLEPAGES=$[ $PAGES * 2 ] DOUBLEPAGES=$[ $DOUBLEPAGES - 200 ] NEEDPAGES=$[ $DOUBLEPAGES + $DOUBLEPAGES / 5 ] # 20% spare EXIT=0 declare -i maxnode declare -a node declare -a nlist # ===================================================================== numactl() { $VALGRIND $NUMACTL "$@" } failed() { echo '=======FAILED' echo "Check if machine doesn't have background jobs and try again" EXIT=1 } # nstat statname node nstat() { sleep $STAT_INTERVAL nid=node$2 id=`numastat | head -1 | awk -v node=$nid '{ for (i = 1; i <= NF; ++i) if($i==node) print i; exit }'` declare -a fields numastat | grep $1 | while read -a fields ; do echo ${fields[$id]} done } probe_hardware() { declare -i n=0 numnodes=$(numactl --hardware | awk '/^available/ { print $2 }') maxnode=$(expr $numnodes - 1) nlist=( $(numactl --hardware | grep "^node" | tail -1 |awk '{$1=""; print }') ) # find nodes with at least NEEDPAGES of free memory for i in $(seq 0 $maxnode) ; do free=$(numactl --hardware | fgrep " ${nlist[$i]} free" | awk '{print $4}') free=$(( free * MB )) if [[ $((free / PAGESIZE)) -ge $NEEDPAGES ]]; then node[$n]=${nlist[$i]} n=$((n + 1 )) fi done numnodes=$n maxnode=$(expr $numnodes - 1) if [ $numnodes -lt 2 ] ; then echo "need at least two nodes with at least $NEEDPAGES each of" echo "free memory for mempolicy regression tests" exit 77 # Skip test fi } # ========================================================================= _test_process_state() { echo '=>testing numactl' "$@" "memhog -H $SIZE" numactl "$@" memhog -H $SIZE || failed } 
test_process_state() { declare -i n0=${node[0]} n1=${node[1]} _test_process_state --interleave=$n1 a0=`nstat interleave_hit $n0` a1=`nstat interleave_hit $n1` _test_process_state --interleave=$n0,$n1 b0=`nstat interleave_hit $n0` b1=`nstat interleave_hit $n1` if [ $(expr $b1 - $a1) -lt $HALFPAGES ]; then echo "interleaving test failed $n1 $b1 $a1" failed fi if [ $(expr $b0 - $a0) -lt $HALFPAGES ]; then echo "interleaving test failed $n0 $b0 $a0" failed fi _test_process_state --interleave=all _test_process_state --membind=all a=$(expr $(nstat numa_hit $n0) + $(nstat numa_hit $n1)) _test_process_state --membind=$n0,$n1 b=$(expr $(nstat numa_hit $n0) + $(nstat numa_hit $n1)) if [ $(expr $b - $a) -lt $PAGES ]; then echo "membind test failed $n1 $b $a ($PAGES)" failed fi for i in "${node[@]}" ; do a=`nstat numa_hit $i` _test_process_state --membind=$i _test_process_state --preferred=$i b=`nstat numa_hit $i` if [ $(expr $b - $a) -lt $DOUBLEPAGES ]; then echo "membind/preferred on node $ni failed $b $a" failed fi done _test_process_state --localalloc } # ========================================================================= # test mbind _test_mbind() { echo '=>testing memhog -H' "$@" memhog -H $SIZE "$@" || failed } test_mbind() { declare -i n0=${node[0]} n1=${node[1]} a0=`nstat interleave_hit $n0` a1=`nstat interleave_hit $n1` _test_mbind interleave $n0,$n1 b0=`nstat interleave_hit $n0` b1=`nstat interleave_hit $n1` if [ $(expr $b1 - $a1) -lt $HALFPAGES ]; then echo "interleaving test 2 failed $n1 $b1 $a1 expected $HALFPAGES" failed fi if [ $(expr $b0 - $a0) -lt $HALFPAGES ]; then echo "interleaving test 2 failed $n0 $b0 $a0" failed fi _test_mbind interleave all a=$(expr $(nstat numa_hit $n0) + $(nstat numa_hit $n1)) _test_mbind membind $n0,$n1 b=$(expr $(nstat numa_hit $n0) + $(nstat numa_hit $n1)) if [ $(expr $b - $a) -lt $PAGES ]; then echo "membind test 2 failed $b $a ($PAGES)" failed fi for i in "${node[@]}" ; do declare -i ni=${node[$i]} a=`nstat numa_hit $i` 
_test_mbind membind $i _test_mbind preferred $i b=`nstat numa_hit $i` if [ $(expr $b - $a) -lt $DOUBLEPAGES ]; then echo "membind/preferred test 2 on node $ni failed $b $a" failed fi done } # ========================================================================= main() { # Get the interval vm statistics refresh at if [ -e /proc/sys/vm/stat_interval ]; then STAT_INTERVAL=`cat /proc/sys/vm/stat_interval` STAT_INTERVAL=`expr $STAT_INTERVAL \* 2` fi probe_hardware numactl --cpubind=${node[0]} /bin/true numactl --cpubind=${node[1]} /bin/true numactl -s numactl --hardware numastat > A test_process_state test_mbind numastat > B diff -u A B rm A B if [ "$EXIT" = 0 ] ; then echo '========SUCCESS' else echo '========FAILURE' exit 1 fi } # ========================================================================= main 0707010000004F000081A40000000000000000000000016319106A00000394000000000000000000000000000000000000002A00000000numactl-!/bin/bash # test IO affinity parsing # tests may fail depending on machine setup testdir=`dirname "$0"` : ${srcdir:=${testdir}/..} : ${builddir:=${srcdir}} export PATH=${builddir}:$PATH E=0 check() { echo testing $@ if "$@" ; then true else echo failed E=1 fi } fail() { echo testing failure of $@ if "$@" ; then echo failed E=1 else true fi } check "${builddir}/test/node-parse" file:. 
check "${builddir}/test/node-parse" ip: fail "${builddir}/test/node-parse" ip: IF=$(ip link ls | grep eth | cut -d: -f2 | head -1) check "${builddir}/test/node-parse" "netdev:$IF" fail "${builddir}/test/node-parse" netdev:lo DEV=$(df | awk '/\/$/ { print $1 }') check "${builddir}/test/node-parse" file:$DEV check "${builddir}/test/node-parse" block:$(basename $DEV) check "${builddir}/test/node-parse" pci:0:0.0 if [ "$E" = 0 ] ; then echo SUCCESS ; else echo FAILURE ; fi exit $E 07070100000050000081ED0000000000000000000000016319106A00000232000000000000000000000000000000000000002800000000numactl-!/bin/sh # More regression tests for libnuma/numa api VALGRIND=${VALGRIND:-} testdir=`dirname "$0"` : ${srcdir:=${testdir}/..} : ${builddir:=${srcdir}} export PATH=${builddir}:$PATH T() { echo "$@" if ! $VALGRIND "$@" ; then echo $1 FAILED!!!! exit 1 fi echo } # still broken #T "${builddir}/test/prefered" T "${builddir}/test/distance" T "${builddir}/test/nodemap" T "${srcdir}/test/checkaffinity" T "${srcdir}/test/checktopology" T "${builddir}/test/tbitmap" T "${srcdir}/test/bind_range" #T "${builddir}/test/randmap" 07070100000051000081ED0000000000000000000000016319106A000002D9000000000000000000000000000000000000002600000000numactl-!/bin/sh # run the Linux Test Project with various numactl settings. will run for a few hours. 
# must run as root
# You can download LTP from
# Change LTP below to the source directory of a compiled LTP distribution
LTP=/src/ltp
LEN=2h
LTPOPT="-q -p -t $LEN"

export PATH=`pwd`/..:$PATH
# Stop right away when the LTP tree is missing; otherwise every ./runltp
# below would be looked up in whatever directory we happen to be in.
cd "$LTP" || exit 1

# Three passes over the same set of memory policies; each run logs to its own
# n.<policy>.<pass> file.
for i in 1 2 3 ; do
    numactl --interleave=all ./runltp $LTPOPT -l n.interleave.all.$i
    numactl --interleave=0,1 ./runltp $LTPOPT -l n.interleave.01.$i
    numactl --preferred=0 --cpubind=1 ./runltp $LTPOPT -l n.preferred.$i
    # the VM test that allocates all memory may fail
    numactl --membind=1 --cpubind=0 ./runltp $LTPOPT -l n.membind1.$i
    numactl --membind=0,1 ./runltp $LTPOPT -l n.membind01.$i
done
07070100000052000081ED0000000000000000000000016319106A0000060D000000000000000000000000000000000000002700000000numactl-!/bin/sh
# basic shared memory policy test

# hugetlbfs and tmpfs must be mounted on these mount points
TMPFS=/dev/shm
HUGE=/huge

#valgrind 3.0.1 doesn't implement mbind() yet on x86-64
#VALGRIND="valgrind --tool=memcheck"
VALGRIND=

set -e
export PATH=`pwd`/..:$PATH

# Run the numactl binary from the parent directory, optionally under $VALGRIND.
numactl() {
    $VALGRIND ../numactl "$@"
}

# EXIT handler while the test is running: show how numastat changed since the
# last checkpoint and exit with status 1.
failure() {
    numastat > after
    set +e
    diff -u before after
    echo
    echo TEST FAILED
    exit 1
}

# EXIT handler installed once every step has passed.
success() {
    echo test succeeded
}

# Record the current numastat output in the file "before".
checkpoint() {
    numastat > before
}

trap failure EXIT

# basictest SEGMENT-OPTIONS
# Apply interleave, membind and preferred policies to separate 2m windows of
# the segment named by $1 and dump the segment after each step.
# $1 is left unquoted on purpose: callers pass several options in one string.
basictest() {
    echo initial
    checkpoint
    numactl --length=20m $1 --dump
    echo interleave
    checkpoint
    numactl --offset=2m --length=2m $1 --strict --interleave=0,1 --verify --dump
    echo interleave verify
    checkpoint
    numactl $1 --dump
    echo membind setup
    checkpoint
    numactl --offset 4m --length=2m $1 --strict --membind=1 --verify --dump
    echo membind verify
    checkpoint
    numactl $1 --dump
    echo preferred setup
    checkpoint
    numactl --offset 6m --length 2m $1 --strict --preferred=1 --verify --dump
    echo preferred verify
    checkpoint
    numactl $1 --dump
    # check overlaps here
}

# cleanupshm FILE
# If FILE exists, remove the shared memory segment keyed by it (a failing
# ipcrm is tolerated) and then the file itself.
cleanupshm() {
    if [ -f "$1" ] ; then
        ipcrm -M `./ftok "$1"` || true
        rm "$1"
    fi
}

# Print a section heading surrounded by blank lines.
banner() {
    echo
    echo ++++++++++++ $1 +++++++++++++++
    echo
}

banner shm
cleanupshm A
basictest --shm=A
cleanupshm A

banner hugeshm
cleanupshm B
basictest "--huge --shm=B"
cleanupshm B banner tmpfs basictest "--file $TMPFS/B" rm $TMPFS/B # first need a way to create holey hugetlbfs files. #banner hugetlbfs #basictest "--file $HUGE/B" #rm /hugetlbfs/B rm before trap success EXIT 07070100000053000081A40000000000000000000000016319106A00000B61000000000000000000000000000000000000002900000000numactl-* Unit test bitmap parser */ #define _GNU_SOURCE 1 //#include <asm/bitops.h> #include <stdio.h> #include <string.h> #include <assert.h> #include <stdlib.h> #include <ctype.h> #include "numa.h" #include "util.h" #define ALIGN(x,a) (((x)+(a)-1)&~((a)-1)) #define test_bit(i,p) ((p)[(i) / BITS_PER_LONG] & (1UL << ((i)%BITS_PER_LONG))) #define set_bit(i,p) ((p)[(i) / BITS_PER_LONG] |= (1UL << ((i)%BITS_PER_LONG))) #define clear_bit(i,p) ((p)[(i) / BITS_PER_LONG] &= ~(1UL << ((i)%BITS_PER_LONG))) typedef unsigned u32; #define BITS_PER_LONG (sizeof(long)*8) #define round_up(x,y) (((x) + (y) - 1) & ~((y)-1)) #define CPU_BYTES(x) (round_up(x, BITS_PER_LONG)/8) #define CPU_LONGS(x) (CPU_BYTES(x) / sizeof(long)) /* Following routine extracted from Linux 2.6.16 */ #define CHUNKSZ 32 #define nbits_to_hold_value(val) fls(val) #define unhex(c) (isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10)) #define BASEDEC 10 /* fancier cpuset lists input in decimal */ /** * bitmap_scnprintf - convert bitmap to an ASCII hex string. * @buf: byte buffer into which string is placed * @buflen: reserved size of @buf, in bytes * @mask: pointer to struct bitmask to convert * * Hex digits are grouped into comma-separated sets of eight digits per set. 
*/ int bitmap_scnprintf(char *buf, unsigned int buflen, struct bitmask *mask) { int i, word, bit, len = 0; unsigned long val; const char *sep = ""; int chunksz; u32 chunkmask; chunksz = mask->size & (CHUNKSZ - 1); if (chunksz == 0) chunksz = CHUNKSZ; i = ALIGN(mask->size, CHUNKSZ) - CHUNKSZ; for (; i >= 0; i -= CHUNKSZ) { chunkmask = ((1ULL << chunksz) - 1); word = i / BITS_PER_LONG; bit = i % BITS_PER_LONG; val = (mask->maskp[word] >> bit) & chunkmask; len += snprintf(buf+len, buflen-len, "%s%0*lx", sep, (chunksz+3)/4, val); chunksz = CHUNKSZ; sep = ","; } return len; } extern int numa_parse_bitmap(char *buf, struct bitmask *mask); #define MASKSIZE 300 int main(void) { char buf[1024]; struct bitmask *mask, *mask2; int i; mask = numa_bitmask_alloc(MASKSIZE); mask2 = numa_bitmask_alloc(MASKSIZE); printf("Testing bitmap functions\n"); for (i = 0; i < MASKSIZE; i++) { numa_bitmask_clearall(mask); numa_bitmask_clearall(mask2); numa_bitmask_setbit(mask, i); assert(find_first(mask) == i); bitmap_scnprintf(buf, sizeof(buf), mask); strcat(buf,"\n"); if (numa_parse_bitmap(buf, mask2) < 0) assert(0); if (memcmp(mask->maskp, mask2->maskp, numa_bitmask_nbytes(mask))) { bitmap_scnprintf(buf, sizeof(buf), mask2); printf("mask2 differs: %s\n", buf); assert(0); } } printf("Passed\n"); return 0; } 07070100000054000081A40000000000000000000000016319106A000003A7000000000000000000000000000000000000002900000000numactl- <numa.h> #include <numaif.h> #include <stdio.h> #include <stdlib.h> #include <sys/wait.h> #include <unistd.h> #define err(x) perror(x),exit(1) enum SZ { MEMSZ = 100<<20, NTHR = 10, }; /* test if shared interleaving state works. 
*/ int main(void) { int i, k; char *mem; int pagesz = getpagesize(); int max_node; if (numa_available() < 0) { printf("no NUMA API available\n"); exit(1); } max_node = numa_max_node(); mem = numa_alloc_interleaved(MEMSZ); for (i = 0; i < NTHR; i++) { if (fork() == 0) { for (k = i*pagesz; k < MEMSZ; k += pagesz * NTHR) { mem[k] = 1; } _exit(0); } } for (i = 0; i < NTHR; i++) wait(NULL); k = 0; for (i = 0; i < MEMSZ; i += pagesz) { int nd; if (get_mempolicy(&nd, NULL, 0, mem + i, MPOL_F_NODE|MPOL_F_ADDR) < 0) err("get_mempolicy"); if (nd != k) printf("offset %d node %d expected %d\n", i, nd, k); k = (k+1)%(max_node+1); } return 0; } 07070100000055000081A40000000000000000000000016319106A0000082D000000000000000000000000000000000000002600000000numactl- <sys/shm.h> #include <sys/ipc.h> #include <sys/fcntl.h> #include <stdio.h> #include <numaif.h> #define err(x) perror(x),exit(1) enum { MEMSZ = 10*1024*1024, }; struct req { enum cmd { SET = 1, CHECK, REPLY, EXIT, } cmd; long offset; long len; int policy; nodemask_t nodes; }; void worker(void) { struct req req; while (read(0, &req, sizeof(struct req) > 0)) { switch (req.cmd) { case SET: if (mbind(map + req.offset, req.len, req.policy, &req.nodes, NUMA_MAX_NODES+1, 0) < 0) err("mbind"); break; case TEST: req.cmd = REPLY; if (get_mempolicy(&req.policy, &req.nodes, NUMA_MAX_NODES+1, map + req.offset, MPOL_F_ADDR) < 0) err("get_mempolicy"); write(1, &req, sizeof(struct req)); break; case EXIT: return; default: abort(); } } } void sendreq(int fd, enum cmd cmd, int policy, long offset, long len, nodemask_t nodes) { struct req req = { .cmd = cmd, .offset = offset, .len = len, .policy = policy, .nodes = nodes }; if (write(fd, &req, sizeof(struct req)) != sizeof(struct req)) panic("bad req write"); } void readreq(int fd, int *policy, nodemask_t *nodes, long offset, long len) { struct req req; if (read(fd, &req, sizeof(struct req)) != sizeof(struct req)) panic("bad req read"); if (req.cmd != REPLY) abort(); *policy = req.policy; 
*nodes = req.nodes; } int main(void) { int fd = open("tshm", O_CREAT, 0600); close(fd); key_t key = ftok("tshm", 1); int shm = shmget(key, MEMSZ, IPC_CREAT|0600); if (shm < 0) err("shmget"); char *map = shmat(shm, NULL, 0); printf("map = %p\n", map); unsigned long nmask = 0x3; if (mbind(map, MEMSZ, MPOL_INTERLEAVE, &nmask, 4, 0) < 0) err("mbind1"); int fd[2]; if (pipe(fd) < 0) err("pipe"); if (fork() == 0) { close(0); close(1); dup2(fd[0], 0); dup2(fd[1], 1); worker(); _exit(0); } int pagesz = getpagesize(); int i; srand(1); for (;;) { /* chose random offset */ /* either in child or here */ /* change policy */ /* ask other guy to check */ } shmdt(map); shmctl(shm, IPC_RMID, 0); } 07070100000056000081A40000000000000000000000016319106A00000BB9000000000000000000000000000000000000002100000000numactl-* Copyright (C) 2003,2004 Andi Kleen, SuSE Labs. numactl is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2. numactl is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should find a copy of v2 of the GNU General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "numa.h" #include "numaif.h" #include "util.h" #include <stdio.h> #include <string.h> #include <stdlib.h> #include <stdarg.h> #include <ctype.h> #include <errno.h> #include <unistd.h> void printmask(char *name, struct bitmask *mask) { int i; printf("%s: ", name); for (i = 0; i < mask->size; i++) if (numa_bitmask_isbitset(mask, i)) printf("%d ", i); putchar('\n'); } int find_first(struct bitmask *mask) { int i; for (i = 0; i < mask->size; i++) if (numa_bitmask_isbitset(mask, i)) return i; return -1; } void complain(char *fmt, ...) { va_list ap; va_start(ap, fmt); fprintf(stderr, "numactl: "); vfprintf(stderr,fmt,ap); putchar('\n'); va_end(ap); exit(1); } void nerror(char *fmt, ...) { int err = errno; va_list ap; va_start(ap,fmt); fprintf(stderr, "numactl: "); vfprintf(stderr, fmt, ap); va_end(ap); if (err) fprintf(stderr,": %s\n", strerror(err)); else fputc('\n', stderr); exit(1); } long memsize(char *s) { char *end; long length = strtoul(s,&end,0); switch (toupper(*end)) { case 'G': length *= 1024; /*FALL THROUGH*/ case 'M': length *= 1024; /*FALL THROUGH*/ case 'K': length *= 1024; break; } return length; } static struct policy { char *name; int policy; int noarg; } policies[] = { { "preferred-many", MPOL_PREFERRED_MANY, }, { "local", MPOL_LOCAL, 1 }, { "interleave", MPOL_INTERLEAVE, }, { "membind", MPOL_BIND, }, { "preferred", MPOL_PREFERRED, }, { "default", MPOL_DEFAULT, 1 }, { NULL }, }; static char *policy_names[] = { "default", "preferred", "bind", "interleave", "local", "preferred-many" }; char *policy_name(int policy) { static char buf[32]; if (policy >= array_len(policy_names)) { sprintf(buf, "[%d]", policy); return buf; } return policy_names[policy]; } int parse_policy(char *name, char *arg) { int k; struct policy *p = NULL; int found = 0; if 
(!name) return MPOL_DEFAULT; while (*name == '-') name++; for (k = 0; policies[k].name; k++) { p = &policies[k]; if (!strcmp(p->name, name)) { found = 1; break; } } if (!found || !p || !p->name || (!arg && !p->noarg)) return MPOL_MAX; return p->policy; } void print_policies(void) { int i; printf("Policies:"); for (i = 0; policies[i].name; i++) printf(" %s", policies[i].name); printf("\n"); } 07070100000057000081A40000000000000000000000016319106A000002E5000000000000000000000000000000000000002100000000numactl- void printmask(char *name, struct bitmask *mask); extern int find_first(struct bitmask *mask); extern struct bitmask *nodemask(char *s); extern struct bitmask *cpumask(char *s, int *ncpus); extern int read_sysctl(char *name); extern void complain(char *fmt, ...); extern void nerror(char *fmt, ...); extern long memsize(char *s); extern int parse_policy(char *name, char *arg); extern void print_policies(void); extern char *policy_name(int policy); #define err(x) perror("numactl: " x),exit(1) #define array_len(x) (sizeof(x)/sizeof(*(x))) #define round_up(x,y) (((x) + (y) - 1) & ~((y)-1)) #if HAVE_ATTRIBUTE_SYMVER #define SYMVER(a,b) __attribute__ ((symver (b))) #else #define SYMVER(a,b) __asm__ (".symver " a "," b); #endif 07070100000058000081A40000000000000000000000016319106A00000FAE000000000000000000000000000000000000002C00000000numactl- Symbols defined in the library which aren't specifically bound to a # version node are effectively bound to an unspecified base version of # the library. It is possible to bind all otherwise unspecified symbols # to a given version node using `global: *' somewhere in the version script. # # The interfaces at the "v1" level. # At this level we present these functions to the linker (and thus to an # application). # Any functions not defined in the global list (i.e. "local") will be internal # to the library (i.e. not exported but used within the library). 
# Thus the real function names, "numa_bind_v1" etc, are local and won't
# be known to the linker.

# the first 16 have v1 aliases
# 3 of the 5 system calls that libnuma provides are common to all versions:
# (set_mempolicy, get_mempolicy and mbind, listed first below)
libnuma_1.1 {
  global:
    set_mempolicy;
    get_mempolicy;
    mbind;
    numa_all_nodes;
    numa_alloc;
    numa_alloc_interleaved;
    numa_alloc_interleaved_subset;
    numa_alloc_local;
    numa_alloc_onnode;
    numa_available;
    numa_bind;
    numa_distance;
    numa_error;
    numa_exit_on_error;
    numa_free;
    numa_get_interleave_mask;
    numa_get_interleave_node;
    numa_get_membind;
    numa_get_run_node_mask;
    numa_interleave_memory;
    numa_max_node;
    numa_migrate_pages;
    numa_no_nodes;
    numa_node_size64;
    numa_node_size;
    numa_node_to_cpus;
    numa_pagesize;
    numa_parse_bitmap;
    numa_police_memory;
    numa_preferred;
    numa_run_on_node;
    numa_run_on_node_mask;
    numa_sched_getaffinity;
    numa_sched_setaffinity;
    numa_set_bind_policy;
    numa_set_interleave_mask;
    numa_set_localalloc;
    numa_set_membind;
    numa_set_preferred;
    numa_set_strict;
    numa_setlocal_memory;
    numa_tonode_memory;
    numa_tonodemask_memory;
    numa_warn;
    numa_exit_on_warn;
    numa_node_to_cpu_update;
  # Everything not named above stays internal to the library.
  local:
    *;
};

# The interfaces at the "v2" level.
# The first 17 have v2 aliases
# We add the bitmask_ functions
# and the move_pages and migrate_pages system calls
# 1.2 depends on 1.1

libnuma_1.2 {
  global:
    copy_bitmask_to_nodemask;
    copy_nodemask_to_bitmask;
    copy_bitmask_to_bitmask;
    move_pages;
    migrate_pages;
    numa_all_cpus_ptr;
    numa_all_nodes_ptr;
    numa_alloc_interleaved_subset;
    numa_realloc;
    numa_allocate_cpumask;
    numa_allocate_nodemask;
    numa_bind;
    numa_bitmask_alloc;
    numa_bitmask_clearall;
    numa_bitmask_clearbit;
    numa_bitmask_equal;
    numa_bitmask_free;
    numa_bitmask_isbitset;
    numa_bitmask_nbytes;
    numa_bitmask_setall;
    numa_bitmask_setbit;
    numa_bitmask_weight;
    numa_get_interleave_mask;
    numa_get_membind;
    numa_get_mems_allowed;
    numa_get_run_node_mask;
    numa_interleave_memory;
    numa_max_possible_node;
    numa_move_pages;
    numa_no_nodes_ptr;
    numa_node_to_cpus;
    numa_node_of_cpu;
    numa_nodes_ptr;
    numa_num_configured_cpus;
    numa_num_configured_nodes;
    numa_num_possible_nodes;
    numa_num_task_cpus;
    numa_num_task_nodes;
    numa_num_thread_cpus;
    numa_num_thread_nodes;
    numa_parse_bitmap;
    numa_parse_cpustring;
    numa_parse_nodestring;
    numa_run_on_node_mask;
    numa_sched_getaffinity;
    numa_sched_setaffinity;
    numa_set_interleave_mask;
    numa_set_membind;
    numa_tonodemask_memory;
  local:
    *;
} libnuma_1.1;

# New parsing interface for cpu/numastrings
# was added into version 1.3

libnuma_1.3 {
  global:
    numa_parse_cpustring_all;
    numa_parse_nodestring_all;
    numa_num_possible_cpus;
  local:
    *;
} libnuma_1.2;

# New interface with customizable cpuset awareness
# was added into version 1.4
libnuma_1.4 {
  global:
    numa_run_on_node_mask_all;
  local:
    *;
} libnuma_1.3;

# New interface for membind with NUMA balancing optimization
libnuma_1.5 {
  global:
    numa_set_membind_balancing;
  local:
    *;
} libnuma_1.4;

# Interfaces for the preferred-many policy; 1.6 depends on 1.5
libnuma_1.6{
  global:
    numa_has_preferred_many;
    numa_set_preferred_many;
    numa_preferred_many;
  local:
    *;
} libnuma_1.5;
07070100000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000B00000000TRAILER!!!814 blocks
