Project import generated by Copybara. GitOrigin-RevId: 148c095e787643f7fb66cab935bd71ff08fe54ca
diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..a434cbd --- /dev/null +++ b/.travis.yml
@@ -0,0 +1,11 @@ +language: c +dist: trusty + +compiler: + - clang + - gcc + +script: ./autogen.sh && ./configure && make && make check + +after_script: cat ./tests/runoneshot.sh.log +
diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..3cbb8a0 --- /dev/null +++ b/AUTHORS
@@ -0,0 +1,3 @@ +Arjen Van De Ven <arjanvandeven@gmail.com> +Neil Horman <nhorman@gmail.com> +
diff --git a/Android.mk b/Android.mk new file mode 100644 index 0000000..74bef5e --- /dev/null +++ b/Android.mk
@@ -0,0 +1,22 @@ +LOCAL_PATH := $(call my-dir) + +include $(CLEAR_VARS) + +LOCAL_SRC_FILES:= \ + activate.c \ + bitmap.c \ + classify.c \ + cputree.c \ + irqbalance.c \ + irqlist.c \ + numa.c \ + placement.c \ + procinterrupts.c + +LOCAL_CFLAGS += -Wall -Wextra -DHAVE_GETOPT_LONG +LOCAL_SHARED_LIBRARIES := libcutils libglib + +LOCAL_MODULE_TAGS := optional +LOCAL_MODULE := irqbalance + +include $(BUILD_EXECUTABLE)
diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..d60c31a --- /dev/null +++ b/COPYING
@@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. 
+ + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License.
diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..73988b3 --- /dev/null +++ b/Makefile.am
@@ -0,0 +1,58 @@ +# Makefile.am -- +# Copyright 2009 Red Hat Inc., Durham, North Carolina. +# All Rights Reserved. +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Authors: +# Steve Grubb <sgrubb@redhat.com> +# + +AUTOMAKE_OPTIONS = no-dependencies +ACLOCAL_AMFLAGS = -I m4 +EXTRA_DIST = COPYING autogen.sh misc/irqbalance.service misc/irqbalance.env + +SUBDIRS = tests + +UI_DIR = ui +AM_CFLAGS = $(LIBCAP_NG_CFLAGS) $(GLIB2_CFLAGS) +AM_CPPFLAGS = -I${top_srcdir} -W -Wall -Wshadow -Wformat -Wundef -D_GNU_SOURCE +noinst_HEADERS = bitmap.h constants.h cpumask.h irqbalance.h non-atomic.h \ + types.h $(UI_DIR)/helpers.h $(UI_DIR)/irqbalance-ui.h $(UI_DIR)/ui.h +sbin_PROGRAMS = irqbalance + +if IRQBALANCEUI +sbin_PROGRAMS += irqbalance-ui +endif + +irqbalance_SOURCES = activate.c bitmap.c classify.c cputree.c irqbalance.c \ + irqlist.c numa.c placement.c procinterrupts.c +irqbalance_LDADD = $(LIBCAP_NG_LIBS) $(GLIB2_LIBS) +if IRQBALANCEUI +irqbalance_ui_SOURCES = $(UI_DIR)/helpers.c $(UI_DIR)/irqbalance-ui.c \ + $(UI_DIR)/ui.c +irqbalance_ui_LDADD = $(GLIB2_LIBS) $(CURSES_LIBS) +endif + +dist_man_MANS = irqbalance.1 +if IRQBALANCEUI +dist_man_MANS += irqbalance-ui.1 +endif + +CONFIG_CLEAN_FILES = debug*.list config/* +clean-generic: + rm -rf autom4te*.cache + rm -f *.rej *.orig *~ +
diff --git a/NOTICE b/NOTICE new file mode 120000 index 0000000..d24842f --- /dev/null +++ b/NOTICE
@@ -0,0 +1 @@ +COPYING \ No newline at end of file
diff --git a/README.md b/README.md new file mode 100644 index 0000000..23427a9 --- /dev/null +++ b/README.md
@@ -0,0 +1,40 @@ +What is Irqbalance +================== + +Irqbalance is a daemon to help balance the cpu load generated by interrupts +across all of a system's cpus. Irqbalance identifies the highest volume +interrupt sources, and isolates each of them to a single unique cpu, so that +load is spread as much as possible over an entire processor set, while +minimizing cache miss rates for irq handlers. + +## Building and Installing [![Build Status](https://travis-ci.org/Irqbalance/irqbalance.svg?branch=master)](https://travis-ci.org/Irqbalance/irqbalance) + +```bash +./autogen.sh +./configure [options] +make +make install +``` + +## Developing Irqbalance + +Irqbalance is currently hosted on GitHub, and so developers are welcome to use +the issue/pull request/etc infrastructure found there. However, most +development discussions take place on the irqbalance mailing list, which can be +subscribed to at: +http://lists.infradead.org/mailman/listinfo/irqbalance + +New Developers are encouraged to use this mailing list to discuss ideas and +propose patches. + +## Bug reporting + +When something goes wrong, feel free to send us a bug report by one of the ways +described above. Your report should include: + +* Irqbalance version you've been using (or commit hash) +* `/proc/interrupts` output +* `irqbalance --debug` output +* content of smp_affinity files - can be obtained by e.g.: + `$ for i in $(seq 0 300); do grep . /proc/irq/$i/smp_affinity /dev/null 2>/dev/null; done` +* your hw hierarchy - e.g. `lstopo-no-graphics` output
diff --git a/activate.c b/activate.c new file mode 100644 index 0000000..065f880 --- /dev/null +++ b/activate.c
@@ -0,0 +1,87 @@ +/* + * Copyright (C) 2006, Intel Corporation + * Copyright (C) 2012, Neil Horman <nhorman@tuxdriver.com> + * + * This file is part of irqbalance + * + * This program file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program in a file named COPYING; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301 USA + */ + +/* + * This file contains the code to communicate a selected distribution / mapping + * of interrupts to the kernel. 
+ */ +#include "config.h" +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdint.h> + +#include "irqbalance.h" + +static int check_affinity(struct irq_info *info, cpumask_t applied_mask) +{ + cpumask_t current_mask; + char buf[PATH_MAX]; + + sprintf(buf, "/proc/irq/%i/smp_affinity", info->irq); + if (process_one_line(buf, get_mask_from_bitmap, ¤t_mask) < 0) + return 1; + + return cpus_equal(applied_mask, current_mask); +} + +static void activate_mapping(struct irq_info *info, void *data __attribute__((unused))) +{ + char buf[PATH_MAX]; + FILE *file; + int ret = 0; + + /* + * only activate mappings for irqs that have moved + */ + if (!info->moved) + return; + + if (!info->assigned_obj) + return; + + /* + * Don't activate anything for which we have an invalid mask + */ + if (check_affinity(info, info->assigned_obj->mask)) + return; + + sprintf(buf, "/proc/irq/%i/smp_affinity", info->irq); + file = fopen(buf, "w"); + if (!file) + return; + + cpumask_scnprintf(buf, PATH_MAX, info->assigned_obj->mask); + ret = fprintf(file, "%s", buf); + if (ret < 0) { + log(TO_ALL, LOG_WARNING, "cannot change irq %i's affinity, add it to banned list", info->irq); + add_banned_irq(info->irq); + remove_one_irq_from_db(info->irq); + } + fclose(file); + info->moved = 0; /*migration is done*/ +} + +void activate_mappings(void) +{ + for_each_irq(NULL, activate_mapping, NULL); +}
diff --git a/autogen.sh b/autogen.sh new file mode 100755 index 0000000..b792e8b --- /dev/null +++ b/autogen.sh
@@ -0,0 +1,5 @@ +#! /bin/sh +set -x -e +mkdir -p m4 +# --no-recursive is available only in recent autoconf versions +autoreconf -fv --install
diff --git a/bitmap.c b/bitmap.c new file mode 100644 index 0000000..6a7421a --- /dev/null +++ b/bitmap.c
@@ -0,0 +1,463 @@ +/* + +This file is taken from the Linux kernel and minimally adapted for use in userspace + +*/ + +/* + * lib/bitmap.c + * Helper functions for bitmap.h. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ +#include "config.h" +#include <unistd.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include "bitmap.h" +#include "non-atomic.h" + +/* + * bitmaps provide an array of bits, implemented using an + * array of unsigned longs. The number of valid bits in a + * given bitmap does _not_ need to be an exact multiple of + * BITS_PER_LONG. + * + * The possible unused bits in the last, partially used word + * of a bitmap are 'don't care'. The implementation makes + * no particular effort to keep them zero. It ensures that + * their value will not affect the results of any operation. + * The bitmap operations that return Boolean (bitmap_empty, + * for example) or scalar (bitmap_weight, for example) results + * carefully filter out these unused bits from impacting their + * results. + * + * These operations actually hold to a slightly stronger rule: + * if you don't input any bitmaps to these ops that have some + * unused bits set, then they won't output any set unused bits + * in output bitmaps. + * + * The byte ordering of bitmaps is more natural on little + * endian architectures. See the big-endian headers + * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h + * for the best explanations of this ordering. 
+ */ + +int __bitmap_empty(const unsigned long *bitmap, int bits) +{ + int k, lim = bits/BITS_PER_LONG; + for (k = 0; k < lim; ++k) + if (bitmap[k]) + return 0; + + if (bits % BITS_PER_LONG) + if (bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) + return 0; + + return 1; +} + +int __bitmap_full(const unsigned long *bitmap, int bits) +{ + int k, lim = bits/BITS_PER_LONG; + for (k = 0; k < lim; ++k) + if (~bitmap[k]) + return 0; + + if (bits % BITS_PER_LONG) + if (~bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) + return 0; + + return 1; +} + +int __bitmap_weight(const unsigned long *bitmap, int bits) +{ + int k, w = 0, lim = bits/BITS_PER_LONG; + + for (k = 0; k < lim; k++) + w += hweight_long(bitmap[k]); + + if (bits % BITS_PER_LONG) + w += hweight_long(bitmap[k] & BITMAP_LAST_WORD_MASK(bits)); + + return w; +} + +int __bitmap_equal(const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits) +{ + int k, lim = bits/BITS_PER_LONG; + for (k = 0; k < lim; ++k) + if (bitmap1[k] != bitmap2[k]) + return 0; + + if (bits % BITS_PER_LONG) + if ((bitmap1[k] ^ bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) + return 0; + + return 1; +} + +void __bitmap_complement(unsigned long *dst, const unsigned long *src, int bits) +{ + int k, lim = bits/BITS_PER_LONG; + for (k = 0; k < lim; ++k) + dst[k] = ~src[k]; + + if (bits % BITS_PER_LONG) + dst[k] = ~src[k] & BITMAP_LAST_WORD_MASK(bits); +} + +/* + * __bitmap_shift_right - logical right shift of the bits in a bitmap + * @dst - destination bitmap + * @src - source bitmap + * @nbits - shift by this many bits + * @bits - bitmap size, in bits + * + * Shifting right (dividing) means moving bits in the MS -> LS bit + * direction. Zeros are fed into the vacated MS positions and the + * LS bits shifted off the bottom are lost. 
+ */ +void __bitmap_shift_right(unsigned long *dst, + const unsigned long *src, int shift, int bits) +{ + int k, lim = BITS_TO_LONGS(bits), left = bits % BITS_PER_LONG; + int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; + unsigned long mask = (1UL << left) - 1; + for (k = 0; off + k < lim; ++k) { + unsigned long upper, lower; + + /* + * If shift is not word aligned, take lower rem bits of + * word above and make them the top rem bits of result. + */ + if (!rem || off + k + 1 >= lim) + upper = 0; + else { + upper = src[off + k + 1]; + if (off + k + 1 == lim - 1 && left) + upper &= mask; + } + lower = src[off + k]; + if (left && off + k == lim - 1) + lower &= mask; + dst[k] = upper << (BITS_PER_LONG - rem) | lower >> rem; + if (left && k == lim - 1) + dst[k] &= mask; + } + if (off) + memset(&dst[lim - off], 0, off*sizeof(unsigned long)); +} + + +/* + * __bitmap_shift_left - logical left shift of the bits in a bitmap + * @dst - destination bitmap + * @src - source bitmap + * @nbits - shift by this many bits + * @bits - bitmap size, in bits + * + * Shifting left (multiplying) means moving bits in the LS -> MS + * direction. Zeros are fed into the vacated LS bit positions + * and those MS bits shifted off the top are lost. + */ + +void __bitmap_shift_left(unsigned long *dst, + const unsigned long *src, int shift, int bits) +{ + int k, lim = BITS_TO_LONGS(bits), left = bits % BITS_PER_LONG; + int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG; + for (k = lim - off - 1; k >= 0; --k) { + unsigned long upper, lower; + + /* + * If shift is not word aligned, take upper rem bits of + * word below and make them the bottom rem bits of result. 
+ */ + if (rem && k > 0) + lower = src[k - 1]; + else + lower = 0; + upper = src[k]; + if (left && k == lim - 1) + upper &= (1UL << left) - 1; + dst[k + off] = lower >> (BITS_PER_LONG - rem) | upper << rem; + if (left && k + off == lim - 1) + dst[k + off] &= (1UL << left) - 1; + } + if (off) + memset(dst, 0, off*sizeof(unsigned long)); +} + +void __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits) +{ + int k; + int nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; k++) + dst[k] = bitmap1[k] & bitmap2[k]; +} + +void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits) +{ + int k; + int nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; k++) + dst[k] = bitmap1[k] | bitmap2[k]; +} + +void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits) +{ + int k; + int nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; k++) + dst[k] = bitmap1[k] ^ bitmap2[k]; +} + +void __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits) +{ + int k; + int nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; k++) + dst[k] = bitmap1[k] & ~bitmap2[k]; +} + +int __bitmap_intersects(const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits) +{ + int k, lim = bits/BITS_PER_LONG; + for (k = 0; k < lim; ++k) + if (bitmap1[k] & bitmap2[k]) + return 1; + + if (bits % BITS_PER_LONG) + if ((bitmap1[k] & bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) + return 1; + return 0; +} + +/* + * Bitmap printing & parsing functions: first version by Bill Irwin, + * second version by Paul Jackson, third by Joe Korty. + */ + +#define CHUNKSZ 32 +#define nbits_to_hold_value(val) fls(val) +#define unhex(c) (isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10)) +#define BASEDEC 10 /* fancier cpuset lists input in decimal */ + +/** + * bitmap_scnprintf - convert bitmap to an ASCII hex string. 
+ * @buf: byte buffer into which string is placed + * @buflen: reserved size of @buf, in bytes + * @maskp: pointer to bitmap to convert + * @nmaskbits: size of bitmap, in bits + * + * Exactly @nmaskbits bits are displayed. Hex digits are grouped into + * comma-separated sets of eight digits per set. + */ +int bitmap_scnprintf(char *buf, unsigned int buflen, + const unsigned long *maskp, int nmaskbits) +{ + int i, word, bit, len = 0; + unsigned long val; + const char *sep = ""; + int chunksz; + uint32_t chunkmask; + int first = 1; + + chunksz = nmaskbits & (CHUNKSZ - 1); + if (chunksz == 0) + chunksz = CHUNKSZ; + + i = ALIGN(nmaskbits, CHUNKSZ) - CHUNKSZ; + for (; i >= 0; i -= CHUNKSZ) { + chunkmask = ((1ULL << chunksz) - 1); + word = i / BITS_PER_LONG; + bit = i % BITS_PER_LONG; + val = (maskp[word] >> bit) & chunkmask; + if (val!=0 || !first || i==0) { + len += snprintf(buf+len, buflen-len, "%s%0*lx", sep, + (chunksz+3)/4, val); + sep = ","; + first = 0; + } + chunksz = CHUNKSZ; + } + return len; +} + +/** + * __bitmap_parse - convert an ASCII hex string into a bitmap. + * @buf: pointer to buffer containing string. + * @buflen: buffer size in bytes. If string is smaller than this + * then it must be terminated with a \0. + * @is_user: location of buffer, 0 indicates kernel space + * @maskp: pointer to bitmap array that will contain result. + * @nmaskbits: size of bitmap, in bits. + * + * Commas group hex digits into chunks. Each chunk defines exactly 32 + * bits of the resultant bitmask. No chunk may specify a value larger + * than 32 bits (%-EOVERFLOW), and if a chunk specifies a smaller value + * then leading 0-bits are prepended. %-EINVAL is returned for illegal + * characters and for grouping errors such as "1,,5", ",44", "," and "". + * Leading and trailing whitespace accepted, but not embedded whitespace. 
+ */
+/*
+ * Returns 0 on success, -EINVAL or -EOVERFLOW on malformed input.  @maskp
+ * is cleared first and may be partially filled when an error is returned.
+ */
+int __bitmap_parse(const char *buf, unsigned int buflen,
+		int is_user __attribute((unused)), unsigned long *maskp,
+		int nmaskbits)
+{
+	int c, old_c, totaldigits, ndigits, nchunks, nbits;
+	uint32_t chunk;
+
+	bitmap_zero(maskp, nmaskbits);
+
+	nchunks = nbits = totaldigits = c = 0;
+	do {
+		chunk = ndigits = 0;
+
+		/* Get the next chunk of the bitmap */
+		while (buflen) {
+			old_c = c;
+			/* <ctype.h> functions need an unsigned char value */
+			c = (unsigned char)*buf++;
+			buflen--;
+			if (isspace(c))
+				continue;
+
+			/*
+			 * If the last character was a space and the current
+			 * character isn't '\0', we've got embedded whitespace.
+			 * This is a no-no, so throw an error.
+			 */
+			if (totaldigits && c && isspace(old_c))
+				return -EINVAL;
+
+			/* A '\0' or a ',' signal the end of the chunk */
+			if (c == '\0' || c == ',')
+				break;
+
+			if (!isxdigit(c))
+				return -EINVAL;
+
+			/*
+			 * Make sure there are at least 4 free bits in 'chunk'.
+			 * If not, this hexdigit will overflow 'chunk', so
+			 * throw an error.
+			 */
+			if (chunk & ~((1UL << (CHUNKSZ - 4)) - 1))
+				return -EOVERFLOW;
+
+			chunk = (chunk << 4) | unhex(c);
+			ndigits++; totaldigits++;
+		}
+		if (ndigits == 0)
+			return -EINVAL;
+		/* leading all-zero chunks do not count towards the size */
+		if (nchunks == 0 && chunk == 0)
+			continue;
+
+		/* make room for the new chunk in the low CHUNKSZ bits */
+		__bitmap_shift_left(maskp, maskp, CHUNKSZ, nmaskbits);
+		*maskp |= chunk;
+		nchunks++;
+		nbits += (nchunks == 1) ? nbits_to_hold_value(chunk) : CHUNKSZ;
+		if (nbits > nmaskbits)
+			return -EOVERFLOW;
+	} while (buflen && c == ',');
+
+	return 0;
+}
+
+/**
+ * __bitmap_parselist - convert list format ASCII string to bitmap
+ * @buf: read nul-terminated user string from this buffer
+ * @buflen: buffer size in bytes. If string is smaller than this
+ * then it must be terminated with a \0.
+ * @is_user: location of buffer, 0 indicates kernel space
+ * @maskp: write resulting mask here
+ * @nmaskbits: number of bits in mask to be written
+ *
+ * Input format is a comma-separated list of decimal numbers and
+ * ranges. 
Consecutively set bits are shown as two hyphen-separated + * decimal numbers, the smallest and largest bit numbers set in + * the range. + * + * Returns 0 on success, -errno on invalid input strings. + * Error values: + * %-EINVAL: second number in range smaller than first + * %-EINVAL: invalid character in string + * %-ERANGE: bit number specified too large for mask + */ +int __bitmap_parselist(const char *buf, unsigned int buflen, + int is_user __attribute((unused)), unsigned long *maskp, + int nmaskbits) +{ + int a, b, c, old_c, totaldigits; + int exp_digit, in_range; + + totaldigits = c = 0; + bitmap_zero(maskp, nmaskbits); + do { + exp_digit = 1; + in_range = 0; + a = b = 0; + + /* Get the next cpu# or a range of cpu#'s */ + while (buflen) { + old_c = c; + c = *buf++; + buflen--; + if (isspace(c)) + continue; + + /* + * If the last character was a space and the current + * character isn't '\0', we've got embedded whitespace. + * This is a no-no, so throw an error. + */ + if (totaldigits && c && isspace(old_c)) + return -EINVAL; + + /* A '\0' or a ',' signal the end of a cpu# or range */ + if (c == '\0' || c == ',') + break; + + if (c == '-') { + if (exp_digit || in_range) + return -EINVAL; + b = 0; + in_range = 1; + exp_digit = 1; + continue; + } + + if (!isdigit(c)) + return -EINVAL; + + b = b * 10 + (c - '0'); + if (!in_range) + a = b; + exp_digit = 0; + totaldigits++; + } + if (!(a <= b)) + return -EINVAL; + if (b >= nmaskbits) + return -ERANGE; + while (a <= b) { + set_bit(a, maskp); + a++; + } + } while (buflen && c == ','); + return 0; +}
diff --git a/bitmap.h b/bitmap.h new file mode 100644 index 0000000..7afce59 --- /dev/null +++ b/bitmap.h
@@ -0,0 +1,362 @@ +#ifndef __LINUX_BITMAP_H +#define __LINUX_BITMAP_H + +#ifndef __ASSEMBLY__ + +#include <string.h> +#include <stdint.h> +#include <unistd.h> + + +#define BITS_PER_LONG ((int)sizeof(unsigned long)*8) + +#define BITS_TO_LONGS(bits) \ + (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) +#define DECLARE_BITMAP(name,bits) \ + unsigned long name[BITS_TO_LONGS(bits)] +#define ALIGN(x,a) (((x)+(a)-1UL)&~((a)-1UL)) + + +#include "non-atomic.h" + +static inline unsigned int hweight32(unsigned int w) +{ + unsigned int res = w - ((w >> 1) & 0x55555555); + res = (res & 0x33333333) + ((res >> 2) & 0x33333333); + res = (res + (res >> 4)) & 0x0F0F0F0F; + res = res + (res >> 8); + return (res + (res >> 16)) & 0x000000FF; +} + +static inline unsigned long hweight64(uint64_t w) +{ + if (BITS_PER_LONG == 32) + return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w); + + w -= (w >> 1) & 0x5555555555555555ull; + w = (w & 0x3333333333333333ull) + ((w >> 2) & 0x3333333333333333ull); + w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0full; + return (w * 0x0101010101010101ull) >> 56; +} + + +static inline int fls(int x) +{ + int r = 32; + + if (!x) + return 0; + if (!(x & 0xffff0000u)) { + x <<= 16; + r -= 16; + } + if (!(x & 0xff000000u)) { + x <<= 8; + r -= 8; + } + if (!(x & 0xf0000000u)) { + x <<= 4; + r -= 4; + } + if (!(x & 0xc0000000u)) { + x <<= 2; + r -= 2; + } + if (!(x & 0x80000000u)) { + x <<= 1; + r -= 1; + } + return r; +} + +static inline unsigned long hweight_long(unsigned long w) +{ + return sizeof(w) == 4 ? hweight32(w) : hweight64(w); +} + +#define min(x,y) ({ \ + typeof(x) _x = (x); \ + typeof(y) _y = (y); \ + (void) (&_x == &_y); \ + _x < _y ? _x : _y; }) + + +/* + * bitmaps provide bit arrays that consume one or more unsigned + * longs. The bitmap interface and available operations are listed + * here, in bitmap.h + * + * Function implementations generic to all architectures are in + * lib/bitmap.c. 
Functions implementations that are architecture + * specific are in various include/asm-<arch>/bitops.h headers + * and other arch/<arch> specific files. + * + * See lib/bitmap.c for more details. + */ + +/* + * The available bitmap operations and their rough meaning in the + * case that the bitmap is a single unsigned long are thus: + * + * Note that nbits should be always a compile time evaluable constant. + * Otherwise many inlines will generate horrible code. + * + * bitmap_zero(dst, nbits) *dst = 0UL + * bitmap_fill(dst, nbits) *dst = ~0UL + * bitmap_copy(dst, src, nbits) *dst = *src + * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 + * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 + * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 + * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) + * bitmap_complement(dst, src, nbits) *dst = ~(*src) + * bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal? + * bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap? + * bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2? + * bitmap_empty(src, nbits) Are all bits zero in *src? + * bitmap_full(src, nbits) Are all bits set in *src? 
+ * bitmap_weight(src, nbits) Hamming Weight: number set bits + * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n + * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n + * bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src) + * bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit) + * bitmap_scnprintf(buf, len, src, nbits) Print bitmap src to buf + * bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf + * bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf + * bitmap_scnlistprintf(buf, len, src, nbits) Print bitmap src as list to buf + * bitmap_parselist(buf, dst, nbits) Parse bitmap dst from list + * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region + * bitmap_release_region(bitmap, pos, order) Free specified bit region + * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region + */ + +/* + * Also the following operations in asm/bitops.h apply to bitmaps. + * + * set_bit(bit, addr) *addr |= bit + * clear_bit(bit, addr) *addr &= ~bit + * change_bit(bit, addr) *addr ^= bit + * test_bit(bit, addr) Is bit set in *addr? + * test_and_set_bit(bit, addr) Set bit and return old value + * test_and_clear_bit(bit, addr) Clear bit and return old value + * test_and_change_bit(bit, addr) Change bit and return old value + * find_first_zero_bit(addr, nbits) Position first zero bit in *addr + * find_first_bit(addr, nbits) Position first set bit in *addr + * find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr >= bit + * find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit + */ + +/* + * The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used + * to declare an array named 'name' of just enough unsigned longs to + * contain all bit positions from 0 to 'bits' - 1. 
+ */ + +/* + * lib/bitmap.c provides these functions: + */ + +extern int __bitmap_empty(const unsigned long *bitmap, int bits); +extern int __bitmap_full(const unsigned long *bitmap, int bits); +extern int __bitmap_equal(const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits); +extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, + int bits); +extern void __bitmap_shift_right(unsigned long *dst, + const unsigned long *src, int shift, int bits); +extern void __bitmap_shift_left(unsigned long *dst, + const unsigned long *src, int shift, int bits); +extern void __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits); +extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits); +extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits); +extern void __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits); +extern int __bitmap_intersects(const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits); +extern int __bitmap_subset(const unsigned long *bitmap1, + const unsigned long *bitmap2, int bits); +extern int __bitmap_weight(const unsigned long *bitmap, int bits); + +extern int bitmap_scnprintf(char *buf, unsigned int len, + const unsigned long *src, int nbits); +extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user, + unsigned long *dst, int nbits); +extern int bitmap_scnlistprintf(char *buf, unsigned int len, + const unsigned long *src, int nbits); +extern int __bitmap_parselist(const char *buf, unsigned int buflen, int is_user, + unsigned long *dst, int nbits); +extern void bitmap_remap(unsigned long *dst, const unsigned long *src, + const unsigned long *old, const unsigned long *new, int bits); +extern int bitmap_bitremap(int oldbit, + const unsigned long *old, const unsigned long 
*new, int bits); +extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order); +extern void bitmap_release_region(unsigned long *bitmap, int pos, int order); +extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order); + +#define BITMAP_LAST_WORD_MASK(nbits) \ +( \ + ((nbits) % BITS_PER_LONG) ? \ + (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \ +) + +static inline void bitmap_zero(unsigned long *dst, int nbits) +{ + if (nbits <= BITS_PER_LONG) + *dst = 0UL; + else { + int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + memset(dst, 0, len); + } +} + +static inline void bitmap_fill(unsigned long *dst, int nbits) +{ + size_t nlongs = BITS_TO_LONGS(nbits); + if (nlongs > 1) { + int len = (nlongs - 1) * sizeof(unsigned long); + memset(dst, 0xff, len); + } + dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits); +} + +static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, + int nbits) +{ + if (nbits <= BITS_PER_LONG) + *dst = *src; + else { + int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + memcpy(dst, src, len); + } +} + +static inline void bitmap_and(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, int nbits) +{ + if (nbits <= BITS_PER_LONG) + *dst = *src1 & *src2; + else + __bitmap_and(dst, src1, src2, nbits); +} + +static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, int nbits) +{ + if (nbits <= BITS_PER_LONG) + *dst = *src1 | *src2; + else + __bitmap_or(dst, src1, src2, nbits); +} + +static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, int nbits) +{ + if (nbits <= BITS_PER_LONG) + *dst = *src1 ^ *src2; + else + __bitmap_xor(dst, src1, src2, nbits); +} + +static inline void bitmap_andnot(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, int nbits) +{ + if (nbits <= BITS_PER_LONG) + *dst = *src1 & ~(*src2); + else + __bitmap_andnot(dst, src1, src2, 
nbits); +} + +static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, + int nbits) +{ + if (nbits <= BITS_PER_LONG) + *dst = ~(*src) & BITMAP_LAST_WORD_MASK(nbits); + else + __bitmap_complement(dst, src, nbits); +} + +static inline int bitmap_equal(const unsigned long *src1, + const unsigned long *src2, int nbits) +{ + if (nbits <= BITS_PER_LONG) + return ! ((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); + else + return __bitmap_equal(src1, src2, nbits); +} + +static inline int bitmap_intersects(const unsigned long *src1, + const unsigned long *src2, int nbits) +{ + if (nbits <= BITS_PER_LONG) + return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; + else + return __bitmap_intersects(src1, src2, nbits); +} + +static inline int bitmap_subset(const unsigned long *src1, + const unsigned long *src2, int nbits) +{ + if (nbits <= BITS_PER_LONG) + return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); + else + return __bitmap_subset(src1, src2, nbits); +} + +static inline int bitmap_empty(const unsigned long *src, int nbits) +{ + if (nbits <= BITS_PER_LONG) + return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); + else + return __bitmap_empty(src, nbits); +} + +static inline int bitmap_full(const unsigned long *src, int nbits) +{ + if (nbits <= BITS_PER_LONG) + return ! 
(~(*src) & BITMAP_LAST_WORD_MASK(nbits)); + else + return __bitmap_full(src, nbits); +} + +static inline int bitmap_weight(const unsigned long *src, int nbits) +{ + if (nbits <= BITS_PER_LONG) + return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); + return __bitmap_weight(src, nbits); +} + +static inline void bitmap_shift_right(unsigned long *dst, + const unsigned long *src, int n, int nbits) +{ + if (nbits <= BITS_PER_LONG) + *dst = *src >> n; + else + __bitmap_shift_right(dst, src, n, nbits); +} + +static inline void bitmap_shift_left(unsigned long *dst, + const unsigned long *src, int n, int nbits) +{ + if (nbits <= BITS_PER_LONG) + *dst = (*src << n) & BITMAP_LAST_WORD_MASK(nbits); + else + __bitmap_shift_left(dst, src, n, nbits); +} + +static inline int bitmap_parse(const char *buf, unsigned int buflen, + unsigned long *maskp, int nmaskbits) +{ + return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits); +} + +static inline int bitmap_parselist(const char *buf, unsigned int buflen, + unsigned long *maskp, int nmaskbits) +{ + return __bitmap_parselist(buf, buflen, 0, maskp, nmaskbits); +} + +#endif /* __ASSEMBLY__ */ + +#endif /* __LINUX_BITMAP_H */
diff --git a/classify.c b/classify.c new file mode 100644 index 0000000..fa25206 --- /dev/null +++ b/classify.c
@@ -0,0 +1,815 @@ +#include "config.h" +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <dirent.h> +#include <assert.h> +#include <errno.h> + +#include "irqbalance.h" +#include "types.h" + + +char *classes[] = { + "other", + "legacy", + "storage", + "video", + "ethernet", + "gbit-ethernet", + "10gbit-ethernet", + "virt-event", + 0 +}; + +static int map_class_to_level[8] = +{ BALANCE_CORE, BALANCE_CACHE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE }; + +struct user_irq_policy { + int ban; + int level; + int numa_node_set; + int numa_node; +}; + +static GList *interrupts_db = NULL; +static GList *banned_irqs = NULL; +GList *cl_banned_irqs = NULL; +static GList *cl_banned_modules = NULL; + +#define SYSFS_DIR "/sys" +#define SYSPCI_DIR "/sys/bus/pci/devices" + +#define PCI_MAX_CLASS 0x14 +#define PCI_MAX_SERIAL_SUBCLASS 0x81 + +#define PCI_INVAL_DATA 0xFFFFFFFF + +struct pci_info { + unsigned short vendor; + unsigned short device; + unsigned short sub_vendor; + unsigned short sub_device; + unsigned int class; +}; + +/* PCI vendor ID, device ID */ +#define PCI_VENDOR_PLX 0x10b5 +#define PCI_DEVICE_PLX_PEX8619 0x8619 +#define PCI_VENDOR_CAVIUM 0x177d +#define PCI_DEVICE_CAVIUM_CN61XX 0x0093 + +/* PCI subsystem vendor ID, subsystem device ID */ +#define PCI_SUB_VENDOR_EMC 0x1120 +#define PCI_SUB_DEVICE_EMC_055B 0x055b +#define PCI_SUB_DEVICE_EMC_0568 0x0568 +#define PCI_SUB_DEVICE_EMC_dd00 0xdd00 + +/* + * Apply software workarounds for some special devices + * + * The world is not perfect and supplies us with broken PCI devices. + * Usually there are two sort of cases: + * + * 1. The device is special + * Before shipping the devices, PCI spec doesn't have the definitions. + * + * 2. Buggy PCI devices + * Some PCI devices don't follow the PCI class code definitions. 
+ */ +static void apply_pci_quirks(const struct pci_info *pci, int *irq_class) +{ + if ((pci->vendor == PCI_VENDOR_PLX) && + (pci->device == PCI_DEVICE_PLX_PEX8619) && + (pci->sub_vendor == PCI_SUB_VENDOR_EMC)) { + switch (pci->sub_device) { + case PCI_SUB_DEVICE_EMC_055B: + case PCI_SUB_DEVICE_EMC_dd00: + *irq_class = IRQ_SCSI; + break; + } + } + + if ((pci->vendor == PCI_VENDOR_CAVIUM) && + (pci->device == PCI_DEVICE_CAVIUM_CN61XX) && + (pci->sub_vendor == PCI_SUB_VENDOR_EMC)) { + switch (pci->sub_device) { + case PCI_SUB_DEVICE_EMC_0568: + *irq_class = IRQ_SCSI; + break; + } + } + + return; +} + +/* Determin IRQ class based on PCI class code */ +static int map_pci_irq_class(unsigned int pci_class) +{ + unsigned int major = pci_class >> 16; + unsigned int sub = (pci_class & 0xFF00) >> 8; + int irq_class = IRQ_NODEF; + /* + * Class codes lifted from below PCI-SIG spec: + * + * PCI Code and ID Assignment Specification v1.5 + * + * and mapped to irqbalance types here. + * + * IRQ_NODEF will go through classification by PCI sub-class code. + */ + static short major_class_codes[PCI_MAX_CLASS] = { + IRQ_OTHER, + IRQ_SCSI, + IRQ_ETH, + IRQ_VIDEO, + IRQ_OTHER, + IRQ_OTHER, + IRQ_LEGACY, + IRQ_OTHER, + IRQ_OTHER, + IRQ_LEGACY, + IRQ_OTHER, + IRQ_OTHER, + IRQ_NODEF, + IRQ_ETH, + IRQ_SCSI, + IRQ_OTHER, + IRQ_OTHER, + IRQ_OTHER, + IRQ_LEGACY, + IRQ_LEGACY, + }; + + /* + * All sub-class code for serial bus controllers. + * The major class code is 0xc. + */ + static short serial_sub_codes[PCI_MAX_SERIAL_SUBCLASS] = { + IRQ_LEGACY, + IRQ_LEGACY, + IRQ_LEGACY, + IRQ_LEGACY, + IRQ_SCSI, + IRQ_LEGACY, + IRQ_SCSI, + IRQ_LEGACY, + IRQ_LEGACY, + IRQ_LEGACY, + [0xa ... 
0x7f] = IRQ_NODEF, + IRQ_LEGACY, + }; + + /* + * Check major class code first + */ + + if (major >= PCI_MAX_CLASS) + return IRQ_NODEF; + + switch (major) { + case 0xc: /* Serial bus class */ + if (sub >= PCI_MAX_SERIAL_SUBCLASS) + return IRQ_NODEF; + irq_class = serial_sub_codes[sub]; + break; + default: /* All other PCI classes */ + irq_class = major_class_codes[major]; + break; + } + + return irq_class; +} + +/* Read specific data from sysfs */ +static unsigned int read_pci_data(const char *devpath, const char* file) +{ + char path[PATH_MAX]; + unsigned int data = PCI_INVAL_DATA; + + sprintf(path, "%s/%s", devpath, file); + if (process_one_line(path, get_hex, &data) < 0) + log(TO_CONSOLE, LOG_WARNING, "PCI: can't get from file:%s\n", path); + + return data; +} + +/* Get pci information for IRQ classification */ +static int get_pci_info(const char *devpath, struct pci_info *pci) +{ + unsigned int data = PCI_INVAL_DATA; + + if ((data = read_pci_data(devpath, "vendor")) == PCI_INVAL_DATA) + return -ENODEV; + pci->vendor = (unsigned short)data; + + if ((data = read_pci_data(devpath, "device")) == PCI_INVAL_DATA) + return -ENODEV; + pci->device = (unsigned short)data; + + if ((data = read_pci_data(devpath, "subsystem_vendor")) == PCI_INVAL_DATA) + return -ENODEV; + pci->sub_vendor = (unsigned short)data; + + if ((data = read_pci_data(devpath, "subsystem_device")) == PCI_INVAL_DATA) + return -ENODEV; + pci->sub_device = (unsigned short)data; + + if ((data = read_pci_data(devpath, "class")) == PCI_INVAL_DATA) + return -ENODEV; + pci->class = data; + + return 0; +} + +/* Return IRQ class for given devpath */ +static int get_irq_class(const char *devpath) +{ + int irq_class = IRQ_NODEF; + struct pci_info pci; + + /* Get PCI info from sysfs */ + if (get_pci_info(devpath, &pci) < 0) + return IRQ_NODEF; + + /* Map PCI class code to irq class */ + irq_class = map_pci_irq_class(pci.class); + if (irq_class < 0) { + log(TO_CONSOLE, LOG_WARNING, "Invalid PCI class code %d\n", + 
pci.class); + return IRQ_NODEF; + } + + /* Reassign irq class for some buggy devices */ + apply_pci_quirks(&pci, &irq_class); + + return irq_class; +} + +static gint compare_ints(gconstpointer a, gconstpointer b) +{ + const struct irq_info *ai = a; + const struct irq_info *bi = b; + + return ai->irq - bi->irq; +} + +static void __add_banned_irq(int irq, GList **list) +{ + struct irq_info find, *new; + GList *entry; + + find.irq = irq; + entry = g_list_find_custom(*list, &find, compare_ints); + if (entry) + return; + + new = calloc(1, sizeof(struct irq_info)); + if (!new) { + log(TO_CONSOLE, LOG_WARNING, "No memory to ban irq %d\n", irq); + return; + } + + new->irq = irq; + new->flags |= IRQ_FLAG_BANNED; + + *list = g_list_append(*list, new); + log(TO_CONSOLE, LOG_INFO, "IRQ %d was BANNED.\n", irq); + return; +} + +void add_banned_irq(int irq) +{ + __add_banned_irq(irq, &banned_irqs); +} + +void add_cl_banned_irq(int irq) +{ + __add_banned_irq(irq, &cl_banned_irqs); +} + +gint substr_find(gconstpointer a, gconstpointer b) +{ + if (strstr(b, a)) + return 0; + else + return 1; +} + +static void add_banned_module(char *modname, GList **modlist) +{ + GList *entry; + char *newmod; + + entry = g_list_find_custom(*modlist, modname, substr_find); + if (entry) + return; + + newmod = strdup(modname); + if (!newmod) { + log(TO_CONSOLE, LOG_WARNING, "No memory to ban module %s\n", modname); + return; + } + + *modlist = g_list_append(*modlist, newmod); +} + +void add_cl_banned_module(char *modname) +{ + add_banned_module(modname, &cl_banned_modules); +} + + +/* + * Inserts an irq_info struct into the intterupts_db list + * devpath points to the device directory in sysfs for the + * related device. NULL devpath means no sysfs entries for + * this irq. 
+ */ +static struct irq_info *add_one_irq_to_db(const char *devpath, struct irq_info *hint, struct user_irq_policy *pol) +{ + int irq = hint->irq; + struct irq_info *new; + int numa_node; + char path[PATH_MAX]; + + new = calloc(1, sizeof(struct irq_info)); + if (!new) + return NULL; + + new->irq = irq; + new->type = hint->type; + new->class = hint->class; + + interrupts_db = g_list_append(interrupts_db, new); + + /* Some special irqs have NULL devpath */ + if (devpath != NULL) { + /* Map PCI class code to irq class */ + int irq_class = get_irq_class(devpath); + if (irq_class < 0) + goto get_numa_node; + new->class = irq_class; + } + + if (pol->level >= 0) + new->level = pol->level; + else + new->level = map_class_to_level[new->class]; + +get_numa_node: + numa_node = NUMA_NO_NODE; + if (devpath != NULL && numa_avail) { + sprintf(path, "%s/numa_node", devpath); + process_one_line(path, get_int, &numa_node); + } + + if (pol->numa_node_set == 1) + new->numa_node = get_numa_node(pol->numa_node); + else + new->numa_node = get_numa_node(numa_node); + + cpus_setall(new->cpumask); + if (devpath != NULL) { + sprintf(path, "%s/local_cpus", devpath); + process_one_line(path, get_mask_from_bitmap, &new->cpumask); + } + + log(TO_CONSOLE, LOG_INFO, "Adding IRQ %d to database\n", irq); + return new; +} + +void remove_one_irq_from_db(int irq) +{ + struct irq_info find, *tmp; + GList *entry = NULL; + + find.irq = irq; + entry = g_list_find_custom(interrupts_db, &find, compare_ints); + if (!entry) + return; + + tmp = entry->data; + interrupts_db = g_list_remove(interrupts_db, tmp); + free(tmp); + log(TO_CONSOLE, LOG_INFO, "IRQ %d was removed from db.\n", irq); + return; +} + +static void parse_user_policy_key(char *buf, int irq, struct user_irq_policy *pol) +{ + char *key, *value, *end; + char *levelvals[] = { "none", "package", "cache", "core" }; + int idx; + int key_set = 1; + + key = buf; + value = strchr(buf, '='); + + if (!value) { + log(TO_SYSLOG, LOG_WARNING, "Bad format for 
policy, ignoring: %s\n", buf); + return; + } + + /* NULL terminate the key and advance value to the start of the value + * string + */ + *value = '\0'; + value++; + end = strchr(value, '\n'); + if (end) + *end = '\0'; + + if (!strcasecmp("ban", key)) { + if (!strcasecmp("false", value)) + pol->ban = 0; + else if (!strcasecmp("true", value)) + pol->ban = 1; + else { + key_set = 0; + log(TO_ALL, LOG_WARNING, "Unknown value for ban policy: %s\n", value); + } + } else if (!strcasecmp("balance_level", key)) { + for (idx=0; idx<4; idx++) { + if (!strcasecmp(levelvals[idx], value)) + break; + } + + if (idx>3) { + key_set = 0; + log(TO_ALL, LOG_WARNING, "Bad value for balance_level policy: %s\n", value); + } else + pol->level = idx; + } else if (!strcasecmp("numa_node", key)) { + idx = strtoul(value, NULL, 10); + if (!get_numa_node(idx)) { + log(TO_ALL, LOG_WARNING, "NUMA node %d doesn't exist\n", + idx); + return; + } + pol->numa_node = idx; + pol->numa_node_set = 1; + } else { + key_set = 0; + log(TO_ALL, LOG_WARNING, "Unknown key returned, ignoring: %s\n", key); + } + + if (key_set) + log(TO_ALL, LOG_INFO, "IRQ %d: Override %s to %s\n", irq, key, value); + + +} + +static int run_script_for_policy(char *script, char *path, int irq, struct user_irq_policy *pol) +{ + char *cmd; + char *brc; + FILE *output; + char buffer[128]; + + cmd = alloca(strlen(path)+strlen(script)+64); + if (!cmd) + return -1; + + sprintf(cmd, "exec %s %s %d", script, path, irq); + output = popen(cmd, "r"); + if (!output) { + log(TO_ALL, LOG_WARNING, "Unable to execute user policy script %s\n", script); + return 1; /* tell caller to ignore this script */ + } + + while(!feof(output)) { + brc = fgets(buffer, 128, output); + if (brc) + parse_user_policy_key(brc, irq, pol); + } + return WEXITSTATUS(pclose(output)); +} + +/* + * Calls out to a possibly user defined script to get user assigned policy + * aspects for a given irq. 
A value of -1 in a given field indicates no + * policy was given and that system defaults should be used + */ +static void get_irq_user_policy(char *path, int irq, struct user_irq_policy *pol) +{ + struct stat sbuf; + DIR *poldir; + struct dirent *entry; + int ret; + char script[1024]; + + memset(pol, -1, sizeof(struct user_irq_policy)); + + /* Return defaults if no script was given */ + if (!polscript) + return; + + if (stat(polscript, &sbuf)) + return; + + /* Use SYSFS_DIR for irq has no sysfs entries */ + if (!path) + path = SYSFS_DIR; + + if (!S_ISDIR(sbuf.st_mode)) { + if (run_script_for_policy(polscript, path, irq, pol) != 0) { + log(TO_CONSOLE, LOG_ERR, "policy script returned non-zero code! skipping user policy\n"); + memset(pol, -1, sizeof(struct user_irq_policy)); + } + } else { + /* polscript is a directory, user multiple script semantics */ + poldir = opendir(polscript); + + if (poldir) { + while ((entry = readdir(poldir)) != NULL) { + snprintf(script, sizeof(script), "%s/%s", polscript, entry->d_name); + if (stat(script, &sbuf)) + continue; + if (S_ISREG(sbuf.st_mode)) { + if (!(sbuf.st_mode & S_IXUSR)) { + log(TO_CONSOLE, LOG_DEBUG, "Skipping script %s due to lack of executable permission\n", script); + continue; + } + + memset(pol, -1, sizeof(struct user_irq_policy)); + ret = run_script_for_policy(script, path, irq, pol); + if ((ret < 0) || (ret >= 2)) { + log(TO_CONSOLE, LOG_ERR, "Error executing policy script %s : %d\n", script, ret); + continue; + } + + /* a ret of 1 means this script isn't + * for this irq + */ + if (ret == 1) + continue; + + log(TO_CONSOLE, LOG_DEBUG, "Accepting script %s to define policy for irq %d\n", script, irq); + break; + } + } + closedir(poldir); + } + } +} + +static int check_for_module_ban(char *name) +{ + GList *entry; + + entry = g_list_find_custom(cl_banned_modules, name, substr_find); + + if (entry) + return 1; + else + return 0; +} + +static int check_for_irq_ban(int irq, GList *proc_interrupts) +{ + struct 
irq_info find, *res; + GList *entry; + + /* + * Check to see if we banned this irq on the command line + */ + find.irq = irq; + entry = g_list_find_custom(cl_banned_irqs, &find, compare_ints); + if (entry) + return 1; + + /* + * Check to see if we banned module which the irq belongs to. + */ + entry = g_list_find_custom(proc_interrupts, &find, compare_ints); + if (entry) { + res = entry->data; + if (check_for_module_ban(res->name)) + return 1; + } + + return 0; +} + +static void add_new_irq(char *path, struct irq_info *hint, GList *proc_interrupts) +{ + struct irq_info *new; + struct user_irq_policy pol; + int irq = hint->irq; + + new = get_irq_info(irq); + if (new) + return; + + /* Set NULL devpath for the irq has no sysfs entries */ + get_irq_user_policy(path, irq, &pol); + if ((pol.ban == 1) || check_for_irq_ban(irq, proc_interrupts)) { /*FIXME*/ + __add_banned_irq(irq, &banned_irqs); + new = get_irq_info(irq); + } else + new = add_one_irq_to_db(path, hint, &pol); + + if (!new) + log(TO_CONSOLE, LOG_WARNING, "add_new_irq: Failed to add irq %d\n", irq); +} + +/* + * Figures out which interrupt(s) relate to the device we"re looking at in dirname + */ +static void build_one_dev_entry(const char *dirname, GList *tmp_irqs) +{ + struct dirent *entry; + DIR *msidir; + int irqnum; + struct irq_info hint; + char path[PATH_MAX]; + char devpath[PATH_MAX]; + + sprintf(path, "%s/%s/msi_irqs", SYSPCI_DIR, dirname); + sprintf(devpath, "%s/%s", SYSPCI_DIR, dirname); + + /* Needs to be further classified */ + hint.class = IRQ_OTHER; + + msidir = opendir(path); + + if (msidir) { + do { + entry = readdir(msidir); + if (!entry) + break; + irqnum = strtol(entry->d_name, NULL, 10); + if (irqnum) { + hint.irq = irqnum; + hint.type = IRQ_TYPE_MSIX; + add_new_irq(devpath, &hint, tmp_irqs); + } + } while (entry != NULL); + closedir(msidir); + return; + } + + sprintf(path, "%s/%s/irq", SYSPCI_DIR, dirname); + if (process_one_line(path, get_int, &irqnum) < 0) + goto done; + + /* + * no pci 
device has irq 0 + * irq 255 is invalid on x86/x64 architectures + */ +#if defined(__i386__) || defined(__x86_64__) + if (irqnum && irqnum != 255) { +#else + if (irqnum) { +#endif + hint.irq = irqnum; + hint.type = IRQ_TYPE_LEGACY; + add_new_irq(devpath, &hint, tmp_irqs); + } + +done: + return; +} + +static void free_irq(struct irq_info *info, void *data __attribute__((unused))) +{ + free(info); +} + +void free_irq_db(void) +{ + for_each_irq(NULL, free_irq, NULL); + g_list_free(interrupts_db); + interrupts_db = NULL; + for_each_irq(banned_irqs, free_irq, NULL); + g_list_free(banned_irqs); + banned_irqs = NULL; + g_list_free(rebalance_irq_list); + rebalance_irq_list = NULL; +} + +void free_cl_opts(void) +{ + g_list_free_full(cl_banned_modules, free); + g_list_free_full(cl_banned_irqs, free); +} + +static void add_missing_irq(struct irq_info *info, void *attr) +{ + GList *proc_interrupts = (GList *) attr; + + add_new_irq(NULL, info, proc_interrupts); +} + +static void free_tmp_irqs(gpointer data) +{ + struct irq_info *info = data; + + free(info->name); + free(info); +} + +void rebuild_irq_db(void) +{ + DIR *devdir; + struct dirent *entry; + GList *tmp_irqs = NULL; + + free_irq_db(); + + tmp_irqs = collect_full_irq_list(); + + devdir = opendir(SYSPCI_DIR); + + if (devdir) { + do { + entry = readdir(devdir); + + if (!entry) + break; + + build_one_dev_entry(entry->d_name, tmp_irqs); + + } while (entry != NULL); + + closedir(devdir); + } + + for_each_irq(tmp_irqs, add_missing_irq, interrupts_db); + + g_list_free_full(tmp_irqs, free_tmp_irqs); + +} + +void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data) +{ + GList *entry = g_list_first(list ? 
list : interrupts_db); + GList *next; + + while (entry) { + next = g_list_next(entry); + cb(entry->data, data); + entry = next; + } +} + +struct irq_info *get_irq_info(int irq) +{ + GList *entry; + struct irq_info find; + + find.irq = irq; + entry = g_list_find_custom(interrupts_db, &find, compare_ints); + + if (!entry) + entry = g_list_find_custom(banned_irqs, &find, compare_ints); + + return entry ? entry->data : NULL; +} + +void migrate_irq(GList **from, GList **to, struct irq_info *info) +{ + GList *entry; + struct irq_info find, *tmp; + + find.irq = info->irq; + entry = g_list_find_custom(*from, &find, compare_ints); + + if (!entry) + return; + + tmp = entry->data; + *from = g_list_delete_link(*from, entry); + + + *to = g_list_append(*to, tmp); + info->moved = 1; +} + +static gint sort_irqs(gconstpointer A, gconstpointer B) +{ + struct irq_info *a, *b; + + a = (struct irq_info*)A; + b = (struct irq_info*)B; + + if (a->class < b->class) + return 1; + if (a->class > b->class) + return -1; + if (a->load < b->load) + return 1; + if (a->load > b->load) + return -1; + if (a < b) + return 1; + return -1; +} + +void sort_irq_list(GList **list) +{ + *list = g_list_sort(*list, sort_irqs); +}
diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..16cd164 --- /dev/null +++ b/configure.ac
dnl Autoconf input for irqbalance: checks for the compiler, optional
dnl numa / ncursesw / systemd / libcap-ng support and the required glib-2.0.
AC_INIT(irqbalance,1.6.0)
AC_PREREQ(2.12)dnl
dnl AC_CONFIG_HEADERS replaces the obsolete AM_CONFIG_HEADER spelling.
AC_CONFIG_HEADERS([config.h])

AC_CONFIG_MACRO_DIR([m4])
AM_INIT_AUTOMAKE([foreign subdir-objects])
AM_PROG_LIBTOOL
AC_SUBST(LIBTOOL_DEPS)

AC_PROG_CC
AC_PROG_INSTALL
AC_PROG_AWK

dnl --disable-numa forces the numa header and library checks below to fail.
AC_ARG_ENABLE([numa],
  AS_HELP_STRING([--disable-numa], [disable numa support (default is auto)]))
AS_IF([test "$enable_numa" = "no"],[
  ac_cv_header_numa_h=no
  ac_cv_lib_numa_numa_available=no
])

AC_HEADER_STDC
AC_CHECK_HEADERS([numa.h])

AC_CHECK_FUNCS(getopt_long)

AC_CHECK_LIB(numa, numa_available)
AC_CHECK_LIB(m, floor)

PKG_CHECK_MODULES([GLIB2], [glib-2.0], [], [AC_MSG_ERROR([glib-2.0 is required])])

dnl Prefer ncursesw from pkg-config, fall back to a plain curses library.
PKG_CHECK_MODULES([NCURSESW], [ncursesw], [has_ncursesw=yes], [AC_CHECK_LIB(curses, mvprintw)])
AS_IF([test "x$has_ncursesw" = "xyes"], [
  AC_SUBST([NCURSESW_CFLAGS])
  AC_SUBST([NCURSESW_LIBS])
  LIBS="$LIBS $NCURSESW_LIBS"
  AC_SUBST([LIBS])
])

AC_C_CONST
AC_C_INLINE
AM_PROG_CC_C_O

AC_ARG_WITH([irqbalance-ui],
  [AS_HELP_STRING([--without-irqbalance-ui],
    [Do not build the irqbalance ui component])],
    [with_irqbalanceui=$withval], [with_irqbalanceui=yes])

AM_CONDITIONAL([IRQBALANCEUI], [test x$with_irqbalanceui = xyes])

AC_ARG_WITH([systemd],
  [ AS_HELP_STRING([--with-systemd],[Add systemd-lib support])]
)
AS_IF(
  [test "x$with_systemd" = xyes], [
    PKG_CHECK_MODULES([SYSTEMD], [libsystemd], [journal_lib=yes], [journal_lib=no])
    AS_IF([test "x$journal_lib" != "xyes"], [
      PKG_CHECK_MODULES([SYSTEMD], [libsystemd-journal], [journal_lib=yes])
    ])
    AC_DEFINE(HAVE_LIBSYSTEMD, 1, [systemd support])
    AC_CHECK_LIB([systemd], [sd_journal_print_with_location])
    AC_CHECK_LIB([systemd], [sd_journal_print])
])

dnl AC_ARG_WITH([libcap-ng]) stores the user's choice in $with_libcap_ng.
AC_ARG_WITH([libcap-ng],
  AS_HELP_STRING([--with-libcap-ng], [Add libcap-ng-support @<:@default=auto@:>@]))

AS_IF(
  [test "x$with_libcap_ng" != "xno"],
  [
  PKG_CHECK_MODULES([LIBCAP_NG], [libcap-ng],
    [AC_DEFINE(HAVE_LIBCAP_NG,1,[libcap-ng support])],
    [
     dnl Only an explicit --with-libcap-ng turns a missing library into an error.
     AS_IF(
       [test "x$with_libcap_ng" = "xyes"],
       [
       AC_MSG_ERROR([libcap-ng not found])
       ]
     )
    ]
  )
  ]
)

AC_OUTPUT(Makefile tests/Makefile)

AC_MSG_NOTICE()
AC_MSG_NOTICE([irqbalance Version: $VERSION])
AC_MSG_NOTICE([Target: $target])
AC_MSG_NOTICE([Installation prefix: $prefix])
AC_MSG_NOTICE([Compiler: $CC])
AC_MSG_NOTICE([Compiler flags: $CFLAGS])
diff --git a/constants.h b/constants.h new file mode 100644 index 0000000..8e34339 --- /dev/null +++ b/constants.h
/* Tunable constants shared across irqbalance. */
#ifndef __INCLUDE_GUARD_CONSTANTS_H
#define __INCLUDE_GUARD_CONSTANTS_H

/* interval between rebalance attempts in seconds */
#define SLEEP_INTERVAL 10

/* nanoseconds per second; note this is a floating-point constant */
#define NSEC_PER_SEC 1e9

/* NUMA topology refresh intervals, in units of SLEEP_INTERVAL */
#define NUMA_REFRESH_INTERVAL 32
/* NIC interrupt refresh interval, in units of SLEEP_INTERVAL */
#define NIC_REFRESH_INTERVAL 32

/* minimum number of interrupts since boot for an interrupt to matter */
#define MIN_IRQ_COUNT 20


/* balancing tunings */

#define CROSS_PACKAGE_PENALTY 3000
#define NUMA_PENALTY 500
#define POWER_MODE_PACKAGE_THRESHOLD 20000
#define CLASS_VIOLATION_PENTALTY 6000
/* correctly spelled alias; the historical misspelling above is kept for existing users */
#define CLASS_VIOLATION_PENALTY CLASS_VIOLATION_PENTALTY
#define MSI_CACHE_PENALTY 10000
#define CORE_SPECIFIC_THRESHOLD 5000

/* power mode */

#define POWER_MODE_SOFTIRQ_THRESHOLD 20
#define POWER_MODE_HYSTERESIS 3


#endif
diff --git a/cpumask.h b/cpumask.h new file mode 100644 index 0000000..5bebbeb --- /dev/null +++ b/cpumask.h
@@ -0,0 +1,323 @@ +#ifndef __LINUX_CPUMASK_H +#define __LINUX_CPUMASK_H + +#define NR_CPUS 4096 +/* + * Cpumasks provide a bitmap suitable for representing the + * set of CPU's in a system, one bit position per CPU number. + * + * See detailed comments in the file linux/bitmap.h describing the + * data type on which these cpumasks are based. + * + * For details of cpumask_scnprintf() and cpumask_parse_user(), + * see bitmap_scnprintf() and bitmap_parse_user() in lib/bitmap.c. + * For details of cpulist_scnprintf() and cpulist_parse(), see + * bitmap_scnlistprintf() and bitmap_parselist(), also in bitmap.c. + * For details of cpu_remap(), see bitmap_bitremap in lib/bitmap.c + * For details of cpus_remap(), see bitmap_remap in lib/bitmap.c. + * + * The available cpumask operations are: + * + * void cpu_set(cpu, mask) turn on bit 'cpu' in mask + * void cpu_clear(cpu, mask) turn off bit 'cpu' in mask + * void cpus_setall(mask) set all bits + * void cpus_clear(mask) clear all bits + * int cpu_isset(cpu, mask) true iff bit 'cpu' set in mask + * int cpu_test_and_set(cpu, mask) test and set bit 'cpu' in mask + * + * void cpus_and(dst, src1, src2) dst = src1 & src2 [intersection] + * void cpus_or(dst, src1, src2) dst = src1 | src2 [union] + * void cpus_xor(dst, src1, src2) dst = src1 ^ src2 + * void cpus_andnot(dst, src1, src2) dst = src1 & ~src2 + * void cpus_complement(dst, src) dst = ~src + * + * int cpus_equal(mask1, mask2) Does mask1 == mask2? + * int cpus_intersects(mask1, mask2) Do mask1 and mask2 intersect? + * int cpus_subset(mask1, mask2) Is mask1 a subset of mask2? + * int cpus_empty(mask) Is mask empty (no bits sets)? + * int cpus_full(mask) Is mask full (all bits sets)? 
+ * int cpus_weight(mask) Hamming weigh - number of set bits + * + * void cpus_shift_right(dst, src, n) Shift right + * void cpus_shift_left(dst, src, n) Shift left + * + * int first_cpu(mask) Number lowest set bit, or NR_CPUS + * int next_cpu(cpu, mask) Next cpu past 'cpu', or NR_CPUS + * + * cpumask_t cpumask_of_cpu(cpu) Return cpumask with bit 'cpu' set + * CPU_MASK_ALL Initializer - all bits set + * CPU_MASK_NONE Initializer - no bits set + * unsigned long *cpus_addr(mask) Array of unsigned long's in mask + * + * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing + * int cpumask_parse_user(ubuf, ulen, mask) Parse ascii string as cpumask + * int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing + * int cpulist_parse(buf, map) Parse ascii string as cpulist + * int cpu_remap(oldbit, old, new) newbit = map(old, new)(oldbit) + * int cpus_remap(dst, src, old, new) *dst = map(old, new)(src) + * + * for_each_cpu_mask(cpu, mask) for-loop cpu over mask + * + * int num_online_cpus() Number of online CPUs + * + * int cpu_online(cpu) Is some cpu online? + * + * for_each_online_cpu(cpu) for-loop cpu over cpu_online_map + * + * Subtlety: + * 1) The 'type-checked' form of cpu_isset() causes gcc (3.3.2, anyway) + * to generate slightly worse code. Note for example the additional + * 40 lines of assembly code compiling the "for each possible cpu" + * loops buried in the disk_stat_read() macros calls when compiling + * drivers/block/genhd.c (arch i386, CONFIG_SMP=y). So use a simple + * one-line #define for cpu_isset(), instead of wrapping an inline + * inside a macro, the way we do the other calls. 
+ */ + +#include "bitmap.h" + +typedef struct { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; +extern cpumask_t _unused_cpumask_arg_; + +#define cpu_set(cpu, dst) __cpu_set((cpu), &(dst)) +static inline void __cpu_set(int cpu, volatile cpumask_t *dstp) +{ + set_bit(cpu, dstp->bits); +} + +#define cpu_clear(cpu, dst) __cpu_clear((cpu), &(dst)) +static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp) +{ + clear_bit(cpu, dstp->bits); +} + +#define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS) +static inline void __cpus_setall(cpumask_t *dstp, int nbits) +{ + bitmap_fill(dstp->bits, nbits); +} + +#define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS) +static inline void __cpus_clear(cpumask_t *dstp, int nbits) +{ + bitmap_zero(dstp->bits, nbits); +} + +/* No static inline type checking - see Subtlety (1) above. */ +#define cpu_isset(cpu, cpumask) test_bit((cpu), (cpumask).bits) + +#define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS) +static inline void __cpus_and(cpumask_t *dstp, const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS) +static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS) +static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define cpus_andnot(dst, src1, src2) \ + __cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS) +static inline void __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define cpus_complement(dst, src) __cpus_complement(&(dst), 
&(src), NR_CPUS) +static inline void __cpus_complement(cpumask_t *dstp, + const cpumask_t *srcp, int nbits) +{ + bitmap_complement(dstp->bits, srcp->bits, nbits); +} + +#define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS) +static inline int __cpus_equal(const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + return bitmap_equal(src1p->bits, src2p->bits, nbits); +} + +#define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS) +static inline int __cpus_intersects(const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + return bitmap_intersects(src1p->bits, src2p->bits, nbits); +} + +#define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS) +static inline int __cpus_subset(const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + return bitmap_subset(src1p->bits, src2p->bits, nbits); +} + +#define cpus_empty(src) __cpus_empty(&(src), NR_CPUS) +static inline int __cpus_empty(const cpumask_t *srcp, int nbits) +{ + return bitmap_empty(srcp->bits, nbits); +} + +#define cpus_full(cpumask) __cpus_full(&(cpumask), NR_CPUS) +static inline int __cpus_full(const cpumask_t *srcp, int nbits) +{ + return bitmap_full(srcp->bits, nbits); +} + +#define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS) +static inline int __cpus_weight(const cpumask_t *srcp, int nbits) +{ + return bitmap_weight(srcp->bits, nbits); +} + +#define cpus_shift_right(dst, src, n) \ + __cpus_shift_right(&(dst), &(src), (n), NR_CPUS) +static inline void __cpus_shift_right(cpumask_t *dstp, + const cpumask_t *srcp, int n, int nbits) +{ + bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); +} + +#define cpus_shift_left(dst, src, n) \ + __cpus_shift_left(&(dst), &(src), (n), NR_CPUS) +static inline void __cpus_shift_left(cpumask_t *dstp, + const cpumask_t *srcp, int n, int nbits) +{ + bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); +} + +static inline int __first_cpu(const cpumask_t *srcp) +{ + return ffs(*srcp->bits)-1; 
+} + +#define first_cpu(src) __first_cpu(&(src)) +int __next_cpu(int n, const cpumask_t *srcp); +#define next_cpu(n, src) __next_cpu((n), &(src)) + +#define cpumask_of_cpu(cpu) \ +({ \ + typeof(_unused_cpumask_arg_) m; \ + if (sizeof(m) == sizeof(unsigned long)) { \ + m.bits[0] = 1UL<<(cpu); \ + } else { \ + cpus_clear(m); \ + cpu_set((cpu), m); \ + } \ + m; \ +}) + +#define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS) + +#if 0 + +#define CPU_MASK_ALL \ +(cpumask_t) { { \ + [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ +} } + +#else + +#define CPU_MASK_ALL \ +(cpumask_t) { { \ + [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ + [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ +} } + +#endif + +#define CPU_MASK_NONE \ +(cpumask_t) { { \ + [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ +} } + +#define CPU_MASK_CPU0 \ +(cpumask_t) { { \ + [0] = 1UL \ +} } + +#define cpus_addr(src) ((src).bits) + +#define cpumask_scnprintf(buf, len, src) \ + __cpumask_scnprintf((buf), (len), &(src), NR_CPUS) +static inline int __cpumask_scnprintf(char *buf, int len, + const cpumask_t *srcp, int nbits) +{ + return bitmap_scnprintf(buf, len, srcp->bits, nbits); +} + +#define cpumask_parse_user(ubuf, ulen, dst) \ + __cpumask_parse_user((ubuf), (ulen), &(dst), NR_CPUS) +static inline int __cpumask_parse_user(const char *buf, int len, + cpumask_t *dstp, int nbits) +{ + return bitmap_parse(buf, len, dstp->bits, nbits); +} + +#define cpulist_scnprintf(buf, len, src) \ + __cpulist_scnprintf((buf), (len), &(src), NR_CPUS) +static inline int __cpulist_scnprintf(char *buf, int len, + const cpumask_t *srcp, int nbits) +{ + return bitmap_scnlistprintf(buf, len, srcp->bits, nbits); +} + +#define cpulist_parse(buf, len, dst) __cpulist_parse((buf), (len), &(dst), NR_CPUS) +static inline int __cpulist_parse(const char *buf, int len, cpumask_t *dstp, int nbits) +{ + return bitmap_parselist(buf, len, dstp->bits, nbits); +} + +#define cpu_remap(oldbit, old, new) \ + __cpu_remap((oldbit), &(old), &(new), 
NR_CPUS) +static inline int __cpu_remap(int oldbit, + const cpumask_t *oldp, const cpumask_t *newp, int nbits) +{ + return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); +} + +#define cpus_remap(dst, src, old, new) \ + __cpus_remap(&(dst), &(src), &(old), &(new), NR_CPUS) +static inline void __cpus_remap(cpumask_t *dstp, const cpumask_t *srcp, + const cpumask_t *oldp, const cpumask_t *newp, int nbits) +{ + bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); +} + +#if NR_CPUS > 1 +#define for_each_cpu_mask(cpu, mask) \ + for ((cpu) = first_cpu(mask); \ + (cpu) < NR_CPUS; \ + (cpu) = next_cpu((cpu), (mask))) +#else /* NR_CPUS == 1 */ +#define for_each_cpu_mask(cpu, mask) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) +#endif /* NR_CPUS */ + +/* + * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler + */ +extern cpumask_t cpu_online_map; + +#if NR_CPUS > 1 +#define num_online_cpus() cpus_weight(cpu_online_map) +#define cpu_online(cpu) cpu_isset((cpu), cpu_online_map) +#else +#define num_online_cpus() 1 +#define cpu_online(cpu) ((cpu) == 0) +#endif + +#define for_each_online_cpu(cpu) for_each_cpu_mask((cpu), cpu_online_map) + +#endif /* __LINUX_CPUMASK_H */
diff --git a/cputree.c b/cputree.c new file mode 100644 index 0000000..bef1f40 --- /dev/null +++ b/cputree.c
@@ -0,0 +1,590 @@
+/*
+ * Copyright (C) 2006, Intel Corporation
+ * Copyright (C) 2012, Neil Horman <nhorman@tuxdriver.com>
+ *
+ * This file is part of irqbalance
+ *
+ * This program file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program in a file named COPYING; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301 USA
+ */
+
+/*
+ * This file contains the code to construct and manipulate a hierarchy of processors,
+ * cache domains and processor cores.
+ */
+
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+
+#include <glib.h>
+
+#include "irqbalance.h"
+
+extern char *banned_cpumask_from_ui;
+extern char *cpu_ban_string;
+
+GList *cpus;
+GList *cache_domains;
+GList *packages;
+
+int cache_domain_count;
+
+/* Users want to be able to keep interrupts away from some cpus; store these in a cpumask_t */
+cpumask_t banned_cpus;
+
+cpumask_t cpu_online_map;
+
+/*
+   it's convenient to have the complement of banned_cpus available so that
+   the AND operator can be used to mask out unwanted cpus
+*/
+cpumask_t unbanned_cpus;
+
+/*
+ * Read the first line of the file at path and pass it to cb together with
+ * data.  Returns 0 if a line was read and handed to cb, -1 if the file could
+ * not be opened or no line could be read.
+ */
+int process_one_line(char *path, void (*cb)(char *line, void *data), void *data)
+{
+	FILE *file;
+	char *line = NULL;
+	size_t size = 0;
+	int ret = -1;
+
+	file = fopen(path, "r");
+	if (!file)
+		return ret;
+
+	if (getline(&line, &size, file) > 0) {
+		cb(line, data);
+		ret = 0;
+	}
+	free(line);
+	fclose(file);
+	return ret;
+}
+
+/* Parse line as a hexadecimal number into *(int *)data (process_one_line() callback signature) */
+void get_hex(char *line, void *data)
+{
+	*(int *)data = strtoul(line, NULL, 16);
+}
+
+/* Parse line as a decimal number into *(int *)data (process_one_line() callback signature) */
+void get_int(char *line, void *data)
+{
+	*(int *)data = strtoul(line, NULL, 10);
+}
+
+void get_mask_from_bitmap(char *line, void *mask)
+{
+	cpumask_parse_user(line, strlen(line), *(cpumask_t *)mask);
+}
+
+static void get_mask_from_cpulist(char *line, void *mask)
+{
+	if (strlen(line) && line[0] != '\n')
+		cpulist_parse(line, strlen(line), *(cpumask_t *)mask);
+}
+
+/*
+ * By default do not place IRQs on CPUs the kernel keeps isolated or
+ * nohz_full, as specified through the boot commandline. Users can
+ * override this with the IRQBALANCE_BANNED_CPUS environment variable.
+ */
+static void setup_banned_cpus(void)
+{
+	char *path = NULL;
+	char *env_mask;
+	char buffer[4096];
+	cpumask_t nohz_full;
+	cpumask_t isolated_cpus;
+
+	cpus_clear(isolated_cpus);
+	cpus_clear(nohz_full);
+
+	/* A manually specified cpumask overrides auto-detection. */
+	if (cpu_ban_string != NULL && banned_cpumask_from_ui != NULL) {
+		cpulist_parse(banned_cpumask_from_ui,
+			strlen(banned_cpumask_from_ui), banned_cpus);
+		goto out;
+	}
+	env_mask = getenv("IRQBALANCE_BANNED_CPUS");
+	if (env_mask) {
+		cpumask_parse_user(env_mask, strlen(env_mask), banned_cpus);
+		goto out;
+	}
+
+	path = "/sys/devices/system/cpu/isolated";
+	process_one_line(path, get_mask_from_cpulist, &isolated_cpus);
+
+	path = "/sys/devices/system/cpu/nohz_full";
+	process_one_line(path, get_mask_from_cpulist, &nohz_full);
+
+	cpus_or(banned_cpus, nohz_full, isolated_cpus);
+
+out:
+	cpumask_scnprintf(buffer, sizeof(buffer), isolated_cpus);
+	log(TO_CONSOLE, LOG_INFO, "Isolated CPUs: %s\n", buffer);
+	cpumask_scnprintf(buffer, sizeof(buffer), nohz_full);
+	log(TO_CONSOLE, LOG_INFO, "Adaptive-ticks CPUs: %s\n", buffer);
+	cpumask_scnprintf(buffer, sizeof(buffer), banned_cpus);
+	log(TO_CONSOLE, LOG_INFO, "Banned CPUs: %s\n", buffer);
+}
+
+/* Record NUMA node nodeid on obj; without NUMA support a package is also made a child of that node. */
+static void add_numa_node_to_topo_obj(struct topo_obj *obj, int nodeid)
+{
+	GList *entry;
+	struct topo_obj *node;
+
+	node = get_numa_node(nodeid);
+	if (!node || (numa_avail && (node->number == NUMA_NO_NODE)))
+		return;
+
+	entry = g_list_find(obj->numa_nodes, node);
+	if (!entry)
+		obj->numa_nodes = g_list_append(obj->numa_nodes, node);
+
+	if (!numa_avail && obj->obj_type == OBJ_TYPE_PACKAGE) {
+		entry = g_list_find(node->children, obj);
+		if (!entry) {
+			node->children = g_list_append(node->children, obj);
+			obj->parent = node;
+		}
+	}
+}
+
+/*
+ * Find (or create) the package whose mask equals package_mask and attach
+ * cache to it.  Returns the package, or NULL if allocation failed.
+ */
+static struct topo_obj* add_cache_domain_to_package(struct topo_obj *cache,
+						    int packageid,
+						    cpumask_t package_mask,
+						    int nodeid)
+{
+	GList *entry;
+	struct topo_obj *package;
+
+	entry = g_list_first(packages);
+
+	while (entry) {
+		package = entry->data;
+		if (cpus_equal(package_mask, package->mask)) {
+			if (packageid != package->number)
+				log(TO_ALL, LOG_WARNING, "package_mask with different physical_package_id found!\n");
+			break;
+		}
+		entry = g_list_next(entry);
+	}
+
+	if (!entry) {
+		package = calloc(1, sizeof(struct topo_obj));
+		if (!package) {
+			need_rebuild = 1;
+			return NULL;
+		}
+		package->mask = package_mask;
+		package->obj_type = OBJ_TYPE_PACKAGE;
+		package->obj_type_list = &packages;
+		package->number = packageid;
+		packages = g_list_append(packages, package);
+	}
+
+	entry = g_list_find(package->children, cache);
+	if (!entry) {
+		package->children = g_list_append(package->children, cache);
+		cache->parent = package;
+	}
+
+	if (!numa_avail || (nodeid > NUMA_NO_NODE))
+		add_numa_node_to_topo_obj(package, nodeid);
+
+	return package;
+}
+
+/*
+ * Find (or create) the cache domain whose mask equals cache_mask and attach
+ * cpu to it.  Returns the cache domain, or NULL if allocation failed.
+ */
+static struct topo_obj* add_cpu_to_cache_domain(struct topo_obj *cpu,
+						cpumask_t cache_mask,
+						int nodeid)
+{
+	GList *entry;
+	struct topo_obj *cache;
+
+	entry = g_list_first(cache_domains);
+
+	while (entry) {
+		cache = entry->data;
+		if (cpus_equal(cache_mask, cache->mask))
+			break;
+		entry = g_list_next(entry);
+	}
+
+	if (!entry) {
+		cache = calloc(1, sizeof(struct topo_obj));
+		if (!cache) {
+			need_rebuild = 1;
+			return NULL;
+		}
+		cache->obj_type = OBJ_TYPE_CACHE;
+		cache->mask = cache_mask;
+		cache->number = cache_domain_count;
+		cache->obj_type_list = &cache_domains;
+		cache_domains = g_list_append(cache_domains, cache);
+		cache_domain_count++;
+	}
+
+	entry = g_list_find(cache->children, cpu);
+	if (!entry) {
+		cache->children = g_list_append(cache->children, cpu);
+		cpu->parent = (struct topo_obj *)cache;
+	}
+
+	if (!numa_avail || (nodeid > NUMA_NO_NODE))
+		add_numa_node_to_topo_obj(cache, nodeid);
+
+	return cache;
+}
+
+#define ADJ_SIZE(r,s) PATH_MAX-strlen(r)-strlen(#s)
+/*
+ * Build the topo_obj for one cpu from its sysfs directory (path) and link it
+ * into its cache domain and package.  Offline and banned cpus are skipped.
+ */
+static void do_one_cpu(char *path)
+{
+	struct topo_obj *cpu;
+	char new_path[PATH_MAX];
+	cpumask_t cache_mask, package_mask;
+	struct topo_obj *cache;
+	DIR *dir;
+	struct dirent *entry;
+	int nodeid;
+	int packageid = 0;
+	unsigned int max_cache_index, cache_index, cache_stat;
+	int online_status = 1;
+
+	/* skip offline cpus */
+	snprintf(new_path, ADJ_SIZE(path,"/online"), "%s/online", path);
+	process_one_line(new_path, get_int, &online_status);
+	if (!online_status)
+		return;
+
+	cpu = calloc(1, sizeof(struct topo_obj));
+	if (!cpu) {
+		need_rebuild = 1;
+		return;
+	}
+
+	cpu->obj_type = OBJ_TYPE_CPU;
+
+	/* 27 == strlen("/sys/devices/system/cpu/cpu"); the cpu number follows that prefix */
+	cpu->number = strtoul(&path[27], NULL, 10);
+
+	cpu_set(cpu->number, cpu_online_map);
+
+	cpu_set(cpu->number, cpu->mask);
+
+	/*
+	 * Default the cache_domain mask to be equal to the cpu
+	 */
+	cpus_clear(cache_mask);
+	cpu_set(cpu->number, cache_mask);
+
+	/* if the cpu is on the banned list, just don't add it */
+	if (cpus_intersects(cpu->mask, banned_cpus)) {
+		free(cpu);
+		return;
+	}
+
+	/* try to read the package mask; if it doesn't exist assume solitary */
+	snprintf(new_path, ADJ_SIZE(path, "/topology/core_siblings"),
+		 "%s/topology/core_siblings", path);
+	if (process_one_line(new_path, get_mask_from_bitmap, &package_mask)) {
+		cpus_clear(package_mask);
+		cpu_set(cpu->number, package_mask);
+	}
+
+	/* try to read the package id */
+	snprintf(new_path, ADJ_SIZE(path, "/topology/physical_package_id"),
+		 "%s/topology/physical_package_id", path);
+	process_one_line(new_path, get_int, &packageid);
+
+	/* try to read the cache mask; if it doesn't exist assume solitary */
+	/* We want the deepest cache level available */
+	max_cache_index = 0;
+	cache_index = 1;
+	do {
+		struct stat sb;
+		/* Extra 10 subtraction is for the max character length of %d */
+		snprintf(new_path, ADJ_SIZE(path, "/cache/index%d/shared_cpu_map") - 10,
+			 "%s/cache/index%d/shared_cpu_map", path, cache_index);
+		cache_stat = stat(new_path, &sb);
+		if (!cache_stat) {
+			max_cache_index = cache_index;
+			if (max_cache_index == deepest_cache)
+				break;
+			cache_index ++;
+		}
+	} while(!cache_stat);
+
+	if (max_cache_index > 0) {
+		/* Extra 10 subtraction is for the max character length of %d */
+		snprintf(new_path, ADJ_SIZE(path, "/cache/index%d/shared_cpu_map") - 10,
+			 "%s/cache/index%d/shared_cpu_map", path, max_cache_index);
+		process_one_line(new_path, get_mask_from_bitmap, &cache_mask);
+	}
+
+	nodeid = NUMA_NO_NODE;
+	if (numa_avail) {
+		struct topo_obj *node;
+
+		/* opendir() returns NULL on failure; nodeid then stays NUMA_NO_NODE */
+		dir = opendir(path);
+		while (dir && (entry = readdir(dir)) != NULL) {
+			char *end;
+			int num;
+
+			if (strncmp(entry->d_name, "node", 4) != 0)
+				continue;
+			num = strtol(entry->d_name + 4, &end, 10);
+			if (!*end && num >= 0) {
+				nodeid = num;
+				break;
+			}
+		}
+		if (dir)
+			closedir(dir);
+
+		/*
+		 * In case of multiple NUMA nodes within a CPU package,
+		 * we override package_mask with node mask.
+		 */
+		node = get_numa_node(nodeid);
+		if (node && (cpus_weight(package_mask) > cpus_weight(node->mask)))
+			cpus_and(package_mask, package_mask, node->mask);
+	}
+
+	/*
+	   blank out the banned cpus from the various masks so that interrupts
+	   will never be told to go there
+	 */
+	cpus_and(cache_mask, cache_mask, unbanned_cpus);
+	cpus_and(package_mask, package_mask, unbanned_cpus);
+
+	cache = add_cpu_to_cache_domain(cpu, cache_mask, nodeid);
+	if (cache)
+		add_cache_domain_to_package(cache, packageid, package_mask, nodeid);
+
+	cpu->obj_type_list = &cpus;
+	cpus = g_list_append(cpus, cpu);
+}
+
+static void dump_irq(struct irq_info *info, void *data)
+{
+	int spaces = (long int)data;
+	int i;
+	char * indent = malloc (sizeof(char) * (spaces + 1));
+
+	if (!indent)
+		return;
+	for ( i = 0; i < spaces; i++ )
+		indent[i] = log_indent[0];
+
+	indent[i] = '\0';
+	log(TO_CONSOLE, LOG_INFO, "%sInterrupt %i node_num is %d (%s/%lu:%lu) \n", indent,
+	    info->irq, irq_numa_node(info)->number, classes[info->class], info->load, (info->irq_count - info->last_irq_count));
+	free(indent);
+}
+
+static void dump_numa_node_num(struct topo_obj *p, void *data __attribute__((unused)))
+{
+	log(TO_CONSOLE, LOG_INFO, "%d ", p->number);
+}
+
+static void dump_balance_obj(struct topo_obj *d, void *data __attribute__((unused)))
+{
+	struct topo_obj *c = (struct topo_obj *)d;
+	log(TO_CONSOLE, LOG_INFO, "%s%s%s%sCPU number %i numa_node is ",
+	    log_indent, log_indent, log_indent, log_indent, c->number);
+	for_each_object(cpu_numa_node(c), dump_numa_node_num, NULL);
+	log(TO_CONSOLE, LOG_INFO, "(load %lu)\n", (unsigned long)c->load);
+	if (c->interrupts)
+		for_each_irq(c->interrupts, dump_irq, (void *)18);
+}
+
+static void dump_cache_domain(struct topo_obj *d, void *data)
+{
+	char *buffer = data;
+	/* data is the 4096-byte buffer handed down from dump_tree() via dump_package() */
+	cpumask_scnprintf(buffer, 4096, d->mask);
+	log(TO_CONSOLE, LOG_INFO, "%s%sCache domain %i: numa_node is ",
+	    log_indent, log_indent, d->number);
+	for_each_object(d->numa_nodes, dump_numa_node_num, NULL);
+	log(TO_CONSOLE, LOG_INFO, "cpu mask is %s (load %lu) \n", buffer,
+	    (unsigned long)d->load);
+	if (d->children)
+		for_each_object(d->children, dump_balance_obj, NULL);
+	if (g_list_length(d->interrupts) > 0)
+		for_each_irq(d->interrupts, dump_irq, (void *)10);
+}
+
+static void dump_package(struct topo_obj *d, void *data)
+{
+	char *buffer = data;
+	cpumask_scnprintf(buffer, 4096, d->mask);
+	log(TO_CONSOLE, LOG_INFO, "Package %i: numa_node ", d->number);
+	for_each_object(d->numa_nodes, dump_numa_node_num, NULL);
+	log(TO_CONSOLE, LOG_INFO, "cpu mask is %s (load %lu)\n",
+	    buffer, (unsigned long)d->load);
+	if (d->children)
+		for_each_object(d->children, dump_cache_domain, buffer);
+	if (g_list_length(d->interrupts) > 0)
+		for_each_irq(d->interrupts, dump_irq, (void *)2);
+}
+
+void dump_tree(void)
+{
+	char buffer[4096];
+	for_each_object(packages, dump_package, buffer);
+}
+
+static void clear_irq_stats(struct irq_info *info, void *data __attribute__((unused)))
+{
+	info->load = 0;
+}
+
+static void clear_obj_stats(struct topo_obj *d, void *data __attribute__((unused)))
+{
+	for_each_object(d->children, clear_obj_stats, NULL);
+	for_each_irq(d->interrupts, clear_irq_stats, NULL);
+}
+
+/*
+ * this function removes previous state from the cpu tree, such as
+ * which level does how much work and the actual lists of interrupts
+ * assigned to each component
+ */
+void clear_work_stats(void)
+{
+	for_each_object(numa_nodes, clear_obj_stats, NULL);
+}
+
+
+/*
+ * Scan /sys/devices/system/cpu and build the cpu, cache domain and package
+ * lists from every "cpuN" entry found there.
+ */
+void parse_cpu_tree(void)
+{
+	DIR *dir;
+	struct dirent *entry;
+
+	setup_banned_cpus();
+
+	cpus_complement(unbanned_cpus, banned_cpus);
+
+	dir = opendir("/sys/devices/system/cpu");
+	if (!dir)
+		return;
+	do {
+		int num;
+		char pad;
+		entry = readdir(dir);
+		/*
+		 * We only want to count real cpus, not cpufreq and
+		 * cpuidle
+		 */
+		if (entry &&
+		    sscanf(entry->d_name, "cpu%d%c", &num, &pad) == 1 &&
+		    !strchr(entry->d_name, ' ')) {
+			char new_path[PATH_MAX];
+			snprintf(new_path, PATH_MAX, "/sys/devices/system/cpu/%s", entry->d_name);
+			do_one_cpu(new_path);
+		}
+	} while (entry);
+	closedir(dir);
+	for_each_object(packages, connect_cpu_mem_topo, NULL);
+
+	if (debug_mode)
+		dump_tree();
+
+}
+
+/* Free one topo_obj together with its children/interrupts/numa_nodes lists (list payloads are not freed here). */
+void free_cpu_topo(gpointer data)
+{
+	struct topo_obj *obj = data;
+
+	g_list_free(obj->children);
+	g_list_free(obj->interrupts);
+	g_list_free(obj->numa_nodes);
+	free(obj);
+}
+
+/*
+ * This function frees all memory related to a cpu tree so that a new tree
+ * can be read
+ */
+void clear_cpu_tree(void)
+{
+	g_list_free_full(packages, free_cpu_topo);
+	packages = NULL;
+
+	g_list_free_full(cache_domains, free_cpu_topo);
+	cache_domains = NULL;
+	cache_domain_count = 0;
+
+	g_list_free_full(cpus, free_cpu_topo);
+	cpus = NULL;
+	cpus_clear(cpu_online_map);
+}
+
+static gint compare_cpus(gconstpointer a, gconstpointer b)
+{
+	const struct topo_obj *ai = a;
+	const struct topo_obj *bi = b;
+
+	return ai->number - bi->number;
+}
+
+/* Return the topo_obj of cpu number cpunr, or NULL if it is not in the cpus list. */
+struct topo_obj *find_cpu_core(int cpunr)
+{
+	GList *entry;
+	struct topo_obj find;
+
+	find.number = cpunr;
+	entry = g_list_find_custom(cpus, &find, compare_cpus);
+
+	return entry ? entry->data : NULL;
+}
+
+/* Number of entries currently in the cpus list. */
+int get_cpu_count(void)
+{
+	return g_list_length(cpus);
+}
+
diff --git a/irqbalance-ui.1 b/irqbalance-ui.1 new file mode 100644 index 0000000..7ca4d28 --- /dev/null +++ b/irqbalance-ui.1
@@ -0,0 +1,40 @@ +.de Sh \" Subsection +.br +.if t .Sp +.ne 5 +.PP +\fB\\$1\fR +.PP +.. +.de Sp \" Vertical space (when we can't use .PP) +.if t .sp .5v +.if n .sp +.. +.de Ip \" List item +.br +.ie \\n(.$>=3 .ne \\$3 +.el .ne 3 +.IP "\\$1" \\$2 +.. +.TH "IRQBALANCE-UI" 1 "Jul 2019" "Linux" "irqbalance-ui" +.SH NAME +irqbalance-ui \- user interface for irqbalance +.SH "SYNOPSIS" + +.nf +\fBirqbalance-ui\fR +.fi + +.SH "DESCRIPTION" + +.PP +\fBirqbalance-ui\fR provides an ncurses-based textual user interface to +\fBirqbalance\fR, a daemon responsible for IRQ distribution on Linux systems. +It shows how IRQs are distributed over CPUs at a given moment, and allows one to +set up the sleep interval and the IRQ and CPU banning at runtime. + +.SH "HOMEPAGE" +https://github.com/Irqbalance/irqbalance + +.SH "SEE ALSO" +irqbalance(1)
diff --git a/irqbalance.1 b/irqbalance.1 new file mode 100644 index 0000000..3005f6b --- /dev/null +++ b/irqbalance.1
@@ -0,0 +1,197 @@ +.de Sh \" Subsection +.br +.if t .Sp +.ne 5 +.PP +\fB\\$1\fR +.PP +.. +.de Sp \" Vertical space (when we can't use .PP) +.if t .sp .5v +.if n .sp +.. +.de Ip \" List item +.br +.ie \\n(.$>=3 .ne \\$3 +.el .ne 3 +.IP "\\$1" \\$2 +.. +.TH "IRQBALANCE" 1 "Dec 2006" "Linux" "irqbalance" +.SH NAME +irqbalance \- distribute hardware interrupts across processors on a multiprocessor system +.SH "SYNOPSIS" + +.nf +\fBirqbalance\fR +.fi + +.SH "DESCRIPTION" + +.PP +The purpose of \fBirqbalance\fR is to distribute hardware interrupts across +processors on a multiprocessor system in order to increase performance\&. + +.SH "OPTIONS" + +.TP +.B -o, --oneshot +Causes irqbalance to be run once, after which the daemon exits. +.TP + +.B -d, --debug +Causes irqbalance to print extra debug information. Implies --foreground. + +.TP +.B -f, --foreground +Causes irqbalance to run in the foreground (without --debug). + +.TP +.B -j, --journal +Enables log output optimized for systemd-journal. + +.TP +.B -p, --powerthresh=<threshold> +Set the threshold at which we attempt to move a CPU into powersave mode +If more than <threshold> CPUs are more than 1 standard deviation below the +average CPU softirq workload, and no CPUs are more than 1 standard deviation +above (and have more than 1 IRQ assigned to them), attempt to place 1 CPU in +powersave mode. In powersave mode, a CPU will not have any IRQs balanced to it, +in an effort to prevent that CPU from waking up without need. + +.TP +.B -i, --banirq=<irqnum> +Add the specified IRQ to the set of banned IRQs. irqbalance will not affect +the affinity of any IRQs on the banned list, allowing them to be specified +manually. This option is additive and can be specified multiple times. For +example to ban IRQs 43 and 44 from balancing, use the following command line: +.B irqbalance --banirq=43 --banirq=44 + +.TP +.B -m, --banmod=<module_name> +Add the specified module to the set of banned modules, similar to --banirq. 
+irqbalance will not affect the affinity of any IRQs of given modules, allowing +them to be specified manually. This option is additive and can be specified +multiple times. For example to ban all IRQs of module foo and module bar from +balancing, use the following command line: +.B irqbalance --banmod=foo --banmod=bar + +.TP +.B -c, --deepestcache=<integer> +This allows a user to specify the cache level at which irqbalance partitions +cache domains. Specifying a deeper cache may allow a greater degree of +flexibility for irqbalance to assign IRQ affinity to achieve greater performance +increases, but setting a cache depth too large on some systems (specifically +where all CPUs on a system share the deepest cache level), will cause irqbalance +to see balancing as unnecessary. +.B irqbalance --deepestcache=2 +.P +The default value for deepestcache is 2. + +.TP +.B -l, --policyscript=<script> +When specified, the referenced script or directory will execute once for each discovered IRQ, +with the sysfs device path and IRQ number passed as arguments. Note that the +device path argument will point to the parent directory from which the IRQ +attributes directory may be directly opened. +Policy scripts specified need to be owned and executable by the user of irqbalance process, +if a directory is specified, non-executable files will be skipped. +The script may specify zero or more key=value pairs that will guide irqbalance in +the management of that IRQ. Key=value pairs are printed by the script on stdout +and will be captured and interpreted by irqbalance. Irqbalance expects a zero +exit code from the provided utility. Recognized key=value pairs are: +.TP +.I ban=[true | false] +Directs irqbalance to exclude the passed in IRQ from balancing. +.TP +.I balance_level=[none | package | cache | core] +This allows a user to override the balance level of a given IRQ. 
By default the +balance level is determined automatically based on the pci device class of the +device that owns the IRQ. +.TP +.I numa_node=<integer> +This allows a user to override the NUMA node that sysfs indicates a given device +IRQ is local to. Often, systems will not specify this information in ACPI, and as a +result devices are considered equidistant from all NUMA nodes in a system. +This option allows for that hardware provided information to be overridden, so +that irqbalance can bias IRQ affinity for these devices toward its most local +node. Note that specifying a -1 here forces irqbalance to consider an interrupt +from a device to be equidistant from all nodes. +.TP +Note that, if a directory is specified rather than a regular file, all files in +the directory will be considered policy scripts, and executed on adding of an +irq to a database. If such a directory is specified, scripts in the directory +must additionally exit with one of the following exit codes: +.TP +.I 0 +This indicates the script has a policy for the referenced irq, and that further +script processing should stop +.TP +.I 1 +This indicates that the script has no policy for the referenced irq, and that +script processing should continue +.TP +.I 2 +This indicates that an error has occurred in the script, and it should be skipped +(further processing to continue) + +.TP +.B -s, --pid=<file> +Have irqbalance write its process id to the specified file. By default no +pidfile is written. The written pidfile is automatically unlinked when +irqbalance exits. It is ignored when used with --debug or --foreground. +.TP +.B -t, --interval=<time> +Set the measurement time for irqbalance. irqbalance will sleep for <time> +seconds between samples of the irq load on the system cpus. Defaults to 10. +.SH "ENVIRONMENT VARIABLES" +.TP +.B IRQBALANCE_ONESHOT +Same as --oneshot. + +.TP +.B IRQBALANCE_DEBUG +Same as --debug. 
+ +.TP +.B IRQBALANCE_BANNED_CPUS +Provides a mask of CPUs which irqbalance should ignore and never assign interrupts to. +If not specified, irqbalance uses the mask of isolated and adaptive-ticks CPUs on the +system as the default value. + +.SH "SIGNALS" +.TP +.B SIGHUP +Forces a rescan of the available IRQs and system topology. + +.SH "API" +irqbalance is able to communicate via socket and return its current assignment +tree and setup, as well as set new settings based on sent values. Socket is abstract, +with a name in form of +.B irqbalance<PID>.sock +, where <PID> is the process ID of irqbalance instance to communicate with. +Possible values to send: +.TP +.B stats +Retrieve assignment tree of IRQs to CPUs, in recursive manner. For each CPU node +in tree, its type, number, load and whether the save mode is active are sent. For +each assigned IRQ type, its number, load, number of IRQs since last rebalancing +and its class are sent. Refer to types.h file for explanation of defines. +.TP +.B setup +Get the current value of sleep interval, mask of banned CPUs and list of banned IRQs. +.TP +.B settings sleep <s> +Set new value of sleep interval, <s> >= 1. +.TP +.B settings cpus <cpu_number1> <cpu_number2> ... +Ban listed CPUs from IRQ handling, all old values of banned CPUs are forgotten. +.TP +.B settings ban irqs <irq1> <irq2> ... +Ban listed IRQs from being balanced, all old values of banned IRQs are forgotten. +.PP +irqbalance checks SCM_CREDENTIALS of sender (only root user is allowed to interact). +Based on chosen tools, ancillary message with credentials needs to be sent with request. + +.SH "HOMEPAGE" +https://github.com/Irqbalance/irqbalance +
diff --git a/irqbalance.c b/irqbalance.c new file mode 100644 index 0000000..dc6e17e --- /dev/null +++ b/irqbalance.c
@@ -0,0 +1,722 @@
+/*
+ * Copyright (C) 2006, Intel Corporation
+ * Copyright (C) 2012, Neil Horman <nhorman@tuxdriver.com>
+ *
+ * This file is part of irqbalance
+ *
+ * This program file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program in a file named COPYING; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <sys/time.h>
+#include <syslog.h>
+#include <unistd.h>
+#include <signal.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <fcntl.h>
+#ifdef HAVE_GETOPT_LONG
+#include <getopt.h>
+#endif
+
+#ifdef HAVE_LIBCAP_NG
+#include <cap-ng.h>
+#endif
+#include "irqbalance.h"
+
+volatile int keep_going = 1;
+int socket_fd;
+char socket_name[64];
+int one_shot_mode;
+int debug_mode;
+int foreground_mode;
+int numa_avail;
+int journal_logging = 0;
+int need_rescan;
+int need_rebuild;
+unsigned int log_mask = TO_ALL;
+const char *log_indent;
+unsigned long power_thresh = ULONG_MAX;
+unsigned long deepest_cache = 2;
+unsigned long long cycle_count = 0;
+char *pidfile = NULL;
+char *polscript = NULL;
+long HZ;
+int sleep_interval = SLEEP_INTERVAL;
+int last_interval;
+GMainLoop *main_loop;
+
+char *cpu_ban_string = NULL;
+char *banned_cpumask_from_ui = NULL;
+unsigned long migrate_ratio = 0;
+
+/*
+ * Sleep for about "seconds" seconds, shortened by the current sub-second
+ * part of the wall clock time.
+ */
+static void sleep_approx(int seconds)
+{
+	struct timespec ts;
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+	ts.tv_sec = seconds;
+	ts.tv_nsec = -tv.tv_usec*1000;
+	while (ts.tv_nsec < 0) {
+		ts.tv_sec--;
+		ts.tv_nsec += 1000000000;
+	}
+	nanosleep(&ts, NULL);
+}
+
+#ifdef HAVE_GETOPT_LONG
+struct option lopts[] = {
+	{"oneshot", 0, NULL, 'o'},
+	{"debug", 0, NULL, 'd'},
+	{"foreground", 0, NULL, 'f'},
+	{"powerthresh", 1, NULL, 'p'},
+	{"banirq", 1 , NULL, 'i'},
+	{"deepestcache", 1, NULL, 'c'},
+	{"policyscript", 1, NULL, 'l'},
+	{"pid", 1, NULL, 's'},
+	{"journal", 0, NULL, 'j'},
+	{"banmod", 1 , NULL, 'm'},
+	{"interval", 1 , NULL, 't'},
+	{"version", 0, NULL, 'V'},
+	{"migrateval", 1, NULL, 'e'},
+	{0, 0, 0, 0}
+};
+
+static void usage(void)
+{
+	log(TO_CONSOLE, LOG_INFO, "irqbalance [--oneshot | -o] [--debug | -d] [--foreground | -f] [--journal | -j]\n");
+	log(TO_CONSOLE, LOG_INFO, " [--powerthresh= | -p <off> | <n>] [--banirq= | -i <n>] [--banmod= | -m <module>] [--policyscript= | -l <script>]\n");
+	log(TO_CONSOLE, LOG_INFO, " [--pid= | -s <file>] [--deepestcache= | -c <n>] [--interval= | -t <n>] [--migrateval= | -e <n>]\n");
+}
+
+static void version(void)
+{
+	log(TO_CONSOLE, LOG_INFO, "irqbalance version " VERSION "\n");
+}
+
+/* Parse the command line into the option globals above; prints usage and exits on invalid values. */
+static void parse_command_line(int argc, char **argv)
+{
+	int opt;
+	int longind;
+	unsigned long val;
+
+	while ((opt = getopt_long(argc, argv,
+		"odfjVi:p:s:c:l:m:t:e:",
+		lopts, &longind)) != -1) {
+
+		switch(opt) {
+			case '?':
+				usage();
+				exit(1);
+				break;
+			case 'V':
+				version();
+				exit(1);
+				break;
+			case 'c':
+				deepest_cache = strtoul(optarg, NULL, 10);
+				if (deepest_cache == ULONG_MAX || deepest_cache < 1) {
+					usage();
+					exit(1);
+				}
+				break;
+			case 'd':
+				debug_mode=1;
+				foreground_mode=1;
+				break;
+			case 'f':
+				foreground_mode=1;
+				break;
+			case 'i':
+				val = strtoull(optarg, NULL, 10);
+				if (val == ULONG_MAX) {
+					usage();
+					exit(1);
+				}
+				add_cl_banned_irq((int)val);
+				break;
+			case 'l':
+				free(polscript);
+				polscript = strdup(optarg);
+				break;
+			case 'm':
+				add_cl_banned_module(optarg);
+				break;
+			case 'p':
+				if (!strncmp(optarg, "off", strlen(optarg)))
+					power_thresh = ULONG_MAX;
+				else {
+					power_thresh = strtoull(optarg, NULL, 10);
+					if (power_thresh == ULONG_MAX) {
+						usage();
+						exit(1);
+					}
+				}
+				break;
+			case 'o':
+				one_shot_mode=1;
+				break;
+			case 's':
+				pidfile = optarg;
+				break;
+			case 'j':
+				journal_logging=1;
+				foreground_mode=1;
+				break;
+			case 't':
+				sleep_interval = strtol(optarg, NULL, 10);
+				if (sleep_interval < 1) {
+					usage();
+					exit(1);
+				}
+				break;
+			case 'e':
+				migrate_ratio = strtoul(optarg, NULL, 10);
+				break;
+		}
+	}
+}
+#endif
+
+/*
+ * This builds our object tree.  The Hierarchy is typically pretty
+ * straightforward.
+ * At the top are numa_nodes
+ * CPU packages belong to a single numa_node, unless the cache domains are in
+ * separate nodes.  In that case, the cache domain's parent is the package, but
+ * the numa nodes point to the cache domains instead of the package as their
+ * children.  This allows us to maintain the CPU hierarchy while adjusting for
+ * alternate memory topologies that are present on recent processor.
+ * All Cache domains belong to a CPU package
+ * All CPU cores belong to a cache domain
+ *
+ * Objects are built in that order (top down)
+ *
+ * Object workload is the aggregate sum of the
+ * workload of the objects below it
+ */
+static void build_object_tree(void)
+{
+	build_numa_node_list();
+	parse_cpu_tree();
+	rebuild_irq_db();
+}
+
+static void free_object_tree(void)
+{
+	free_numa_node_list();
+	clear_cpu_tree();
+	free_irq_db();
+}
+
+static void dump_object_tree(void)
+{
+	for_each_object(numa_nodes, dump_numa_node_info, NULL);
+}
+
+/* Detach info from the object it is assigned to (if any) and queue it on rebalance_irq_list. */
+static void force_rebalance_irq(struct irq_info *info, void *data __attribute__((unused)))
+{
+	if (info->level == BALANCE_NONE)
+		return;
+
+	if (info->assigned_obj == NULL)
+		rebalance_irq_list = g_list_append(rebalance_irq_list, info);
+	else
+		migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info);
+
+	info->assigned_obj = NULL;
+}
+
+gboolean handler(gpointer data __attribute__((unused)))
+{
+	keep_going = 0;
+	g_main_loop_quit(main_loop);
+	return TRUE;
+}
+
+gboolean force_rescan(gpointer data __attribute__((unused)))
+{
+	if (cycle_count)
+		need_rescan = 1;
+	return TRUE;
+}
+
+/*
+ * One balancing pass: refresh the interrupt statistics, rebuild the topology
+ * when a rescan/rebuild was requested, then compute and activate placement.
+ */
+gboolean scan(gpointer data __attribute__((unused)))
+{
+	log(TO_CONSOLE, LOG_INFO, "\n\n\n-----------------------------------------------------------------------------\n");
+	clear_work_stats();
+	parse_proc_interrupts();
+
+
+	/* cope with cpu hotplug -- detected during /proc/interrupts parsing */
+	if (need_rescan || need_rebuild) {
+		int try_times = 0;
+
+		need_rescan = 0;
+		cycle_count = 0;
+		log(TO_CONSOLE, LOG_INFO, "Rescanning cpu topology \n");
+		clear_work_stats();
+
+		do {
+			free_object_tree();
+			if (++try_times > 3) {
+				log(TO_CONSOLE, LOG_WARNING, "Rescanning cpu topology: fail\n");
+				goto out;
+			}
+
+			need_rebuild = 0;
+			build_object_tree();
+		} while (need_rebuild);
+
+		for_each_irq(NULL, force_rebalance_irq, NULL);
+		parse_proc_interrupts();
+		parse_proc_stat();
+		sleep_approx(sleep_interval);
+		clear_work_stats();
+		parse_proc_interrupts();
+	}
+
+	parse_proc_stat();
+
+	if (cycle_count)
+		update_migration_status();
+
+	calculate_placement();
+	activate_mappings();
+
+out:
+	if (debug_mode)
+		dump_tree();
+	if (one_shot_mode)
+		keep_going = 0;
+	cycle_count++;
+
+	/* sleep_interval may be changed by socket */
+	if (last_interval != sleep_interval) {
+		last_interval = sleep_interval;
+		g_timeout_add_seconds(sleep_interval, scan, NULL);
+		return FALSE;
+	}
+
+	if (keep_going) {
+		return TRUE;
+	} else {
+		g_main_loop_quit(main_loop);
+		return FALSE;
+	}
+}
+
+/* Append an "IRQ n LOAD n DIFF n CLASS n " record for irq to the heap string at *(char **)data. */
+void get_irq_data(struct irq_info *irq, void *data)
+{
+	char **irqdata = (char **)data;
+	char *newptr = NULL;
+
+	if (!*irqdata)
+		newptr = calloc(24 + 1 + 11 + 20 + 20 + 11, 1);
+	else
+		newptr = realloc(*irqdata, strlen(*irqdata) + 24 + 1 + 11 + 20 + 20 + 11);
+
+	if (!newptr)
+		return;
+
+	*irqdata = newptr;
+
+	sprintf(*irqdata + strlen(*irqdata),
+			"IRQ %d LOAD %lu DIFF %lu CLASS %d ", irq->irq, irq->load,
+			(irq->irq_count - irq->last_irq_count), irq->class);
+}
+
+/* Append the stats of object, of its irqs and (recursively) of its children to the heap string at *(char **)data. */
+void get_object_stat(struct topo_obj *object, void *data)
+{
+	char **stats = (char **)data;
+	char *irq_data = NULL;
+	char *newptr = NULL;
+	size_t irqdlen;
+
+	if (g_list_length(object->interrupts) > 0) {
+		for_each_irq(object->interrupts, get_irq_data, &irq_data);
+	}
+
+	irqdlen = irq_data ? strlen(irq_data) : 0;
+	/*
+	 * Note, the size in both conditional branches below is made up as follows:
+	 * strlen(irq_data) - self explanatory
+	 * 31 - The size of "TYPE NUMBER LOAD SAVE_MODE "
+	 * 11 - The maximal size of a %d printout
+	 * 20 - The maximal size of a %lu printout
+	 * 1 - The trailing string terminator
+	 * This should be adjusted if the string in the sprintf is changed
+	 */
+	if (!*stats) {
+		newptr = calloc(irqdlen + 31 + 11 + 20 + 11 + 1, 1);
+	} else {
+		newptr = realloc(*stats, strlen(*stats) + irqdlen + 31 + 11 + 20 + 11 + 1);
+	}
+
+	if (!newptr) {
+		free(irq_data);
+		return;
+	}
+
+	*stats = newptr;
+
+	sprintf(*stats + strlen(*stats), "TYPE %d NUMBER %d LOAD %lu SAVE_MODE %d %s",
+			object->obj_type, object->number, object->load,
+			object->powersave_mode, irq_data ? irq_data : "");
+	free(irq_data);
+	if (object->obj_type != OBJ_TYPE_CPU) {
+		for_each_object(object->children, get_object_stat, data);
+	}
+}
+
+/*
+ * Handler for activity on the control socket fd: accept one connection,
+ * require the peer to pass SCM_CREDENTIALS with uid 0, then serve a single
+ * "stats", "settings ..." or "setup" request.  Always returns TRUE.
+ */
+gboolean sock_handle(gint fd, GIOCondition condition, gpointer user_data __attribute__((unused)))
+{
+	char buff[500];
+	int sock;
+	int recv_size = 0;
+	int valid_user = 0;
+
+	/* keep one byte free for the terminator appended after recvmsg() */
+	struct iovec iov = { buff, sizeof(buff) - 1 };
+	struct msghdr msg = { 0 };
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = malloc(CMSG_SPACE(sizeof(struct ucred)));
+	msg.msg_controllen = CMSG_SPACE(sizeof(struct ucred));
+
+	struct cmsghdr *cmsg;
+
+	if (condition == G_IO_IN) {
+		sock = accept(fd, NULL, NULL);
+		if (sock < 0) {
+			log(TO_ALL, LOG_WARNING, "Connection couldn't be accepted.\n");
+			goto out;
+		}
+		if ((recv_size = recvmsg(sock, &msg, 0)) < 0) {
+			log(TO_ALL, LOG_WARNING, "Error while receiving data.\n");
+			goto out_close;
+		}
+		/* every parser below treats the request as a C string */
+		buff[recv_size] = '\0';
+		cmsg = CMSG_FIRSTHDR(&msg);
+		if (!cmsg) {
+			log(TO_ALL, LOG_WARNING, "Connection no memory.\n");
+			goto out_close;
+		}
+		if ((cmsg->cmsg_level == SOL_SOCKET) &&
+				(cmsg->cmsg_type == SCM_CREDENTIALS)) {
+			struct ucred *credentials = (struct ucred *) CMSG_DATA(cmsg);
+			if (!credentials->uid) {
+				valid_user = 1;
+			}
+		}
+		if (!valid_user) {
+			log(TO_ALL, LOG_INFO, "Permission denied for user to connect to socket.\n");
+			goto out_close;
+		}
+
+		if (!strncmp(buff, "stats", strlen("stats"))) {
+			char *stats = NULL;
+			for_each_object(numa_nodes, get_object_stat, &stats);
+			/* stats stays NULL when nothing could be collected */
+			if (stats)
+				send(sock, stats, strlen(stats), 0);
+			free(stats);
+		}
+		if (!strncmp(buff, "settings ", strlen("settings "))) {
+			if (!(strncmp(buff + strlen("settings "), "sleep ",
+							strlen("sleep ")))) {
+				/* buff is NUL-terminated, so parse the argument in place */
+				int new_iterval = strtoul(buff + strlen("settings sleep "),
+						NULL, 10);
+				if (new_iterval >= 1) {
+					sleep_interval = new_iterval;
+				}
+			} else if (!(strncmp(buff + strlen("settings "), "ban irqs ",
+							strlen("ban irqs ")))) {
+				char *end;
+				char *irq_string = buff + strlen("settings ban irqs ");
+
+				g_list_free_full(cl_banned_irqs, free);
+				cl_banned_irqs = NULL;
+				need_rescan = 1;
+				if (!strncmp(irq_string, "NONE", strlen("NONE")))
+					goto out_close;
+				int irq = strtoul(irq_string, &end, 10);
+				do {
+					add_cl_banned_irq(irq);
+				} while((irq = strtoul(end, &end, 10)));
+			} else if (!(strncmp(buff + strlen("settings "), "cpus ",
+							strlen("cpus ")))) {
+				banned_cpumask_from_ui = NULL;
+				free(cpu_ban_string);
+				cpu_ban_string = strdup(buff + strlen("settings cpus "));
+				if (!cpu_ban_string)
+					goto out_close;
+				/* strtok() returns NULL for an empty or all-blank list */
+				banned_cpumask_from_ui = strtok(cpu_ban_string, " ");
+				if (!banned_cpumask_from_ui ||
+				    !strncmp(banned_cpumask_from_ui, "NULL", strlen("NULL"))) {
+					banned_cpumask_from_ui = NULL;
+					free(cpu_ban_string);
+					cpu_ban_string = NULL;
+				}
+				need_rescan = 1;
+			}
+		}
+		if (!strncmp(buff, "setup", strlen("setup"))) {
+			char banned[512];
+			char *setup = calloc(strlen("SLEEP ") + 11 + 1, 1);
+			char *newptr = NULL;
+
+			if (!setup)
+				goto out_close;
+			snprintf(setup, strlen("SLEEP ") + 11 + 1, "SLEEP %d ", sleep_interval);
+			if(g_list_length(cl_banned_irqs) > 0) {
+				for_each_irq(cl_banned_irqs, get_irq_data, &setup);
+			}
+			cpumask_scnprintf(banned, 512, banned_cpus);
+			newptr = realloc(setup, strlen(setup) + strlen(banned) + 7 + 1);
+			if (!newptr)
+				goto out_free_setup;
+
+			setup = newptr;
+			snprintf(setup + strlen(setup), strlen(banned) + 7 + 1,
+					"BANNED %s", banned);
+			send(sock, setup, strlen(setup), 0);
+out_free_setup:
+			free(setup);
+		}
+
+out_close:
+		close(sock);
+	}
+
+out:
+	free(msg.msg_control);
+	return TRUE;
+}
+
+int init_socket()
+{
+	struct sockaddr_un addr;
+	memset(&addr, 0, sizeof(struct sockaddr_un));
+
+	socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0);
+	if (socket_fd < 0) {
+		log(TO_ALL, LOG_WARNING, "Socket couldn't be created.\n");
+		return 1;
+	}
+
+	/*
+	 * First try to create a file-based socket in tmpfs.  If that doesn't
+	 * succeed, fall back to an abstract socket (non file-based).
+ */ + addr.sun_family = AF_UNIX; + snprintf(socket_name, 64, "%s/%s%d.sock", SOCKET_TMPFS, SOCKET_PATH, getpid()); + strncpy(addr.sun_path, socket_name, sizeof(addr.sun_path)); + if (bind(socket_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + log(TO_ALL, LOG_WARNING, "Daemon couldn't be bound to the file-based socket.\n"); + + /* Try binding to abstract */ + memset(&addr, 0, sizeof(struct sockaddr_un)); + addr.sun_family = AF_UNIX; + if (bind(socket_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + log(TO_ALL, LOG_WARNING, "Daemon couldn't be bound to the abstract socket, bailing out.\n"); + return 1; + } + } + + int optval = 1; + if (setsockopt(socket_fd, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) < 0) { + log(TO_ALL, LOG_WARNING, "Unable to set socket options.\n"); + return 1; + } + listen(socket_fd, 1); + g_unix_fd_add(socket_fd, G_IO_IN, sock_handle, NULL); + return 0; +} + +int main(int argc, char** argv) +{ + sigset_t sigset, old_sigset; + int ret = EXIT_SUCCESS; + + sigemptyset(&sigset); + sigaddset(&sigset,SIGINT); + sigaddset(&sigset,SIGHUP); + sigaddset(&sigset,SIGTERM); + sigaddset(&sigset,SIGUSR1); + sigaddset(&sigset,SIGUSR2); + sigprocmask(SIG_BLOCK, &sigset, &old_sigset); +#ifdef HAVE_GETOPT_LONG + parse_command_line(argc, argv); +#else /* ! 
HAVE_GETOPT_LONG */ + if (argc>1 && strstr(argv[1],"--debug")) { + debug_mode=1; + foreground_mode=1; + } + if (argc>1 && strstr(argv[1],"--foreground")) + foreground_mode=1; + if (argc>1 && strstr(argv[1],"--oneshot")) + one_shot_mode=1; + if (argc>1 && strstr(argv[1],"--journal")) { + journal_logging=1; + foreground_mode=1; + } +#endif /* HAVE_GETOPT_LONG */ + + /* + * Open the syslog connection + */ + openlog(argv[0], 0, LOG_DAEMON); + setlogmask(LOG_UPTO(LOG_INFO)); + + if (getenv("IRQBALANCE_ONESHOT")) + one_shot_mode=1; + + if (getenv("IRQBALANCE_DEBUG")) { + debug_mode=1; + foreground_mode=1; + } + + /* + * If we are't in debug mode, don't dump anything to the console + * note that everything goes to the console before we check this + */ + if (journal_logging) + log_indent = "...."; + else + log_indent = " "; + + if (!debug_mode) + log_mask &= ~TO_CONSOLE; + + if (numa_available() > -1) { + numa_avail = 1; + } else + log(TO_CONSOLE, LOG_INFO, "This machine seems not NUMA capable.\n"); + + if (geteuid() != 0) + log(TO_ALL, LOG_WARNING, "Irqbalance hasn't been executed under root privileges, thus it won't in fact balance interrupts.\n"); + + HZ = sysconf(_SC_CLK_TCK); + if (HZ == -1) { + log(TO_ALL, LOG_WARNING, "Unable to determine HZ defaulting to 100\n"); + HZ = 100; + } + + if (!foreground_mode) { + int pidfd = -1; + if (daemon(0,0)) + exit(EXIT_FAILURE); + /* Write pidfile which can be used to avoid starting multiple instances */ + if (pidfile && (pidfd = open(pidfile, + O_WRONLY | O_CREAT | O_EXCL | O_TRUNC, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) >= 0) { + char str[16]; + snprintf(str, sizeof(str), "%u\n", getpid()); + write(pidfd, str, strlen(str)); + close(pidfd); + } + } + + build_object_tree(); + if (debug_mode) + dump_object_tree(); + + + /* On single core UP systems irqbalance obviously has no work to do */ + if (num_online_cpus() <= 1) { + char *msg = "Balancing is ineffective on systems with a " + "single cpu. 
Shutting down\n"; + + log(TO_ALL, LOG_WARNING, "%s", msg); + goto out; + } + + + g_unix_signal_add(SIGINT, handler, NULL); + g_unix_signal_add(SIGTERM, handler, NULL); + g_unix_signal_add(SIGUSR1, handler, NULL); + g_unix_signal_add(SIGUSR2, handler, NULL); + g_unix_signal_add(SIGHUP, force_rescan, NULL); + sigprocmask(SIG_SETMASK, &old_sigset, NULL); + +#ifdef HAVE_LIBCAP_NG + // Drop capabilities + capng_clear(CAPNG_SELECT_BOTH); + capng_lock(); + capng_apply(CAPNG_SELECT_BOTH); +#endif + for_each_irq(NULL, force_rebalance_irq, NULL); + + parse_proc_interrupts(); + parse_proc_stat(); + +// On EUREKA we don't use irqbalance UI. +#ifndef BUILD_EUREKA + if (init_socket()) { + ret = EXIT_FAILURE; + goto out; + } +#endif // !BUILD_EUREKA + main_loop = g_main_loop_new(NULL, FALSE); + last_interval = sleep_interval; + g_timeout_add_seconds(sleep_interval, scan, NULL); + g_main_loop_run(main_loop); + + g_main_loop_quit(main_loop); + +out: + free_object_tree(); + free_cl_opts(); + free(polscript); + + /* Remove pidfile */ + if (!foreground_mode && pidfile) + unlink(pidfile); + /* Remove socket */ + if (socket_fd > 0) + close(socket_fd); + if (socket_name[0]) + unlink(socket_name); + + return ret; +}
diff --git a/irqbalance.h b/irqbalance.h new file mode 100644 index 0000000..5b387fa --- /dev/null +++ b/irqbalance.h
@@ -0,0 +1,176 @@ +#ifndef __INCLUDE_GUARD_IRQBALANCE_H_ +#define __INCLUDE_GUARD_IRQBALANCE_H_ + + +#include "constants.h" + +#include "cpumask.h" + +#include <stdint.h> +#include <glib.h> +#include <glib-unix.h> +#include <syslog.h> +#include <limits.h> + +#include "types.h" +#include "config.h" + +#ifdef __aarch64__ +#define AARCH64 +#endif + +#ifdef HAVE_NUMA_H +#include <numa.h> +#else +#define numa_available() -1 +#endif + +#ifdef HAVE_LIBSYSTEMD +#include <systemd/sd-journal.h> +#endif + +#define NUMA_NO_NODE (-1) + +extern char *classes[]; + +extern void parse_cpu_tree(void); +extern void clear_work_stats(void); +extern void parse_proc_interrupts(void); +extern GList* collect_full_irq_list(); +extern void parse_proc_stat(void); +extern void set_interrupt_count(int number, uint64_t count); +extern void set_msi_interrupt_numa(int number); + +extern GList *rebalance_irq_list; + +void update_migration_status(void); +void dump_workloads(void); +void sort_irq_list(GList **list); +void calculate_placement(void); +void dump_tree(void); + +void activate_mappings(void); +void clear_cpu_tree(void); +void free_cpu_topo(gpointer data); + +/*===================NEW BALANCER FUNCTIONS============================*/ + +/* + * Master topo_obj type lists + */ +extern GList *numa_nodes; +extern GList *packages; +extern GList *cache_domains; +extern GList *cpus; +extern int numa_avail; +extern GList *cl_banned_irqs; + +extern int debug_mode; +extern int journal_logging; +extern int one_shot_mode; +extern int need_rescan; +extern int need_rebuild; +extern unsigned long long cycle_count; +extern unsigned long power_thresh; +extern unsigned long deepest_cache; +extern char *polscript; +extern cpumask_t banned_cpus; +extern cpumask_t unbanned_cpus; +extern long HZ; +extern unsigned long migrate_ratio; + +/* + * Numa node access routines + */ +extern void build_numa_node_list(void); +extern void free_numa_node_list(void); +extern void dump_numa_node_info(struct topo_obj *node, void 
*data); +extern void connect_cpu_mem_topo(struct topo_obj *p, void *data); +extern struct topo_obj *get_numa_node(int nodeid); + +/* + * cpu core functions + */ +#define cpu_numa_node(cpu) ((cpu)->parent->numa_nodes) +extern struct topo_obj *find_cpu_core(int cpunr); +extern int get_cpu_count(void); + +/* + * irq db functions + */ +extern void rebuild_irq_db(void); +extern void free_irq_db(void); +extern void add_cl_banned_irq(int irq); +extern void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data); +extern struct irq_info *get_irq_info(int irq); +extern void migrate_irq(GList **from, GList **to, struct irq_info *info); +extern void free_cl_opts(void); +extern void add_cl_banned_module(char *modname); +extern void add_banned_irq(int irq); +extern void remove_one_irq_from_db(int irq); +#define irq_numa_node(irq) ((irq)->numa_node) + + +/* + * Generic object functions + */ +static inline void for_each_object(GList *list, void (*cb)(struct topo_obj *obj, void *data), void *data) +{ + GList *entry, *next; + entry = g_list_first(list); + while (entry) { + next = g_list_next(entry); + cb(entry->data, data); + entry = next; + } +} + +// irqbalance version +#define VERSION "1.6" + +/* + * Logging functions + */ +#define TO_SYSLOG (1 << 0) +#define TO_CONSOLE (1 << 1) +#define TO_ALL (TO_SYSLOG | TO_CONSOLE) + +extern const char * log_indent; +extern unsigned int log_mask; +#ifdef HAVE_LIBSYSTEMD +#define log(mask, lvl, fmt, args...) do { \ + if (journal_logging) { \ + sd_journal_print(lvl, fmt, ##args); \ + if (log_mask & mask & TO_CONSOLE) \ + printf(fmt, ##args); \ + } else { \ + if (log_mask & mask & TO_SYSLOG) \ + syslog(lvl, fmt, ##args); \ + if (log_mask & mask & TO_CONSOLE) \ + printf(fmt, ##args); \ + } \ +}while(0) +#else /* ! HAVE_LIBSYSTEMD */ +#define log(mask, lvl, fmt, args...) 
do { \ + if (journal_logging) { \ + printf("<%d>", lvl); \ + printf(fmt, ##args); \ + } else { \ + if (log_mask & mask & TO_SYSLOG) \ + syslog(lvl, fmt, ##args); \ + if (log_mask & mask & TO_CONSOLE) \ + printf(fmt, ##args); \ + } \ +}while(0) +#endif /* HAVE_LIBSYSTEMD */ + +#define SOCKET_PATH "irqbalance" +#define SOCKET_TMPFS "/run/irqbalance/" + +extern int process_one_line(char *path, void (*cb)(char *line, void *data), void *data); +extern void get_mask_from_bitmap(char *line, void *mask); +extern void get_int(char *line, void *data); +extern void get_hex(char *line, void *data); + +#endif /* __INCLUDE_GUARD_IRQBALANCE_H_ */ +
diff --git a/irqlist.c b/irqlist.c new file mode 100644 index 0000000..9ab321a --- /dev/null +++ b/irqlist.c
@@ -0,0 +1,207 @@ +/* + * Copyright (C) 2006, Intel Corporation + * Copyright (C) 2012, Neil Horman <nhorman@tuxdriver.com> + * + * This file is part of irqbalance + * + * This program file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program in a file named COPYING; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301 USA + */ + +/* + * This file has the basic functions to manipulate interrupt metadata + */ +#include "config.h" +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/types.h> +#include <dirent.h> +#include <errno.h> +#include <math.h> + +#include "types.h" +#include "irqbalance.h" + + + +struct load_balance_info { + unsigned long long int total_load; + unsigned long long avg_load; + unsigned long long min_load; + unsigned long long adjustment_load; + int load_sources; + unsigned long long int deviations; + long double std_deviation; + unsigned int num_over; + unsigned int num_under; + unsigned int num_powersave; + struct topo_obj *powersave; +}; + +static void gather_load_stats(struct topo_obj *obj, void *data) +{ + struct load_balance_info *info = data; + + if (info->load_sources == 0 || obj->load < info->min_load) + info->min_load = obj->load; + info->total_load += obj->load; + info->load_sources += 1; +} + +static void compute_deviations(struct topo_obj *obj, void *data) +{ + struct load_balance_info *info = data; + unsigned long long int deviation; + + deviation = (obj->load > 
info->avg_load) ? + obj->load - info->avg_load : + info->avg_load - obj->load; + + info->deviations += (deviation * deviation); +} + +static void move_candidate_irqs(struct irq_info *info, void *data) +{ + struct load_balance_info *lb_info = data; + unsigned long delta_load = 0; + + /* Don't rebalance irqs that don't want it */ + if (info->level == BALANCE_NONE) + return; + + /* Don't move cpus that only have one irq, regardless of load */ + if (g_list_length(info->assigned_obj->interrupts) <= 1) + return; + + /* IRQs with a load of 1 have most likely not had any interrupts and + * aren't worth migrating + */ + if (info->load <= 1) + return; + + if (migrate_ratio > 0) { + delta_load = (lb_info->adjustment_load - lb_info->min_load) / migrate_ratio; + } + + /* If we can migrate an irq without swapping the imbalance do it. */ + if ((lb_info->min_load + info->load) - (lb_info->adjustment_load - info->load) < delta_load) { + lb_info->adjustment_load -= info->load; + lb_info->min_load += info->load; + if (lb_info->min_load > lb_info->adjustment_load) { + lb_info->min_load = lb_info->adjustment_load; + } + } else + return; + + log(TO_CONSOLE, LOG_INFO, "Selecting irq %d for rebalancing\n", info->irq); + + migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info); + + info->assigned_obj = NULL; +} + +static void migrate_overloaded_irqs(struct topo_obj *obj, void *data) +{ + struct load_balance_info *info = data; + + if (obj->powersave_mode) + info->num_powersave++; + + if ((obj->load + info->std_deviation) <= info->avg_load) { + info->num_under++; + if (power_thresh != ULONG_MAX && !info->powersave) + if (!obj->powersave_mode) + info->powersave = obj; + } else if ((obj->load - info->std_deviation) >=info->avg_load) { + info->num_over++; + } + + if ((obj->load > info->min_load) && + (g_list_length(obj->interrupts) > 1)) { + /* order the list from greatest to least workload */ + sort_irq_list(&obj->interrupts); + /* + * Each irq carries a weighted average 
amount of load + * we think it's responsible for. This object's load is larger + * than the object with the minimum load. Select irqs for + * migration if we could move them to the minimum object + * without reversing the imbalance or until we only have one + * left. + */ + info->adjustment_load = obj->load; + for_each_irq(obj->interrupts, move_candidate_irqs, info); + } +} + +static void force_irq_migration(struct irq_info *info, void *data __attribute__((unused))) +{ + migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info); + info->assigned_obj = NULL; +} + +static void clear_powersave_mode(struct topo_obj *obj, void *data __attribute__((unused))) +{ + obj->powersave_mode = 0; +} + +static void find_overloaded_objs(GList *name, struct load_balance_info *info) { + memset(info, 0, sizeof(struct load_balance_info)); + for_each_object(name, gather_load_stats, info); + info->load_sources = (info->load_sources == 0) ? 1 : (info->load_sources); + info->avg_load = info->total_load / info->load_sources; + for_each_object(name, compute_deviations, info); + /* Don't divide by zero if there is a single load source */ + if (info->load_sources == 1) + info->std_deviation = 0; + else { + info->std_deviation = (long double)(info->deviations / (info->load_sources - 1)); + info->std_deviation = sqrt(info->std_deviation); + } + + for_each_object(name, migrate_overloaded_irqs, info); +} + +void update_migration_status(void) +{ + struct load_balance_info info; + find_overloaded_objs(cpus, &info); + if (power_thresh != ULONG_MAX && cycle_count > 5) { + if (!info.num_over && (info.num_under >= power_thresh) && info.powersave) { + log(TO_ALL, LOG_INFO, "cpu %d entering powersave mode\n", info.powersave->number); + info.powersave->powersave_mode = 1; + if (g_list_length(info.powersave->interrupts) > 0) + for_each_irq(info.powersave->interrupts, force_irq_migration, NULL); + } else if ((info.num_over) && (info.num_powersave)) { + log(TO_ALL, LOG_INFO, "Load average 
increasing, re-enabling all cpus for irq balancing\n"); + for_each_object(cpus, clear_powersave_mode, NULL); + } + } + find_overloaded_objs(cache_domains, &info); + find_overloaded_objs(packages, &info); + find_overloaded_objs(numa_nodes, &info); +} + +static void dump_workload(struct irq_info *info, void *unused __attribute__((unused))) +{ + log(TO_CONSOLE, LOG_INFO, "Interrupt %i node_num %d (class %s) has workload %lu \n", + info->irq, irq_numa_node(info)->number, classes[info->class], (unsigned long)info->load); +} + +void dump_workloads(void) +{ + for_each_irq(NULL, dump_workload, NULL); +} +
diff --git a/misc/90-irqbalance.rules b/misc/90-irqbalance.rules new file mode 100644 index 0000000..e9cf937 --- /dev/null +++ b/misc/90-irqbalance.rules
@@ -0,0 +1,5 @@ +# Udev rules for irqbalance. On every device add or remove, we want to rescan +# our irq list to make sure it hasn't changed +# + +ACTION=="add|remove", RUN+="/usr/bin/killall -SIGHUP irqbalance"
diff --git a/misc/irqbalance.env b/misc/irqbalance.env new file mode 100644 index 0000000..23570b2 --- /dev/null +++ b/misc/irqbalance.env
@@ -0,0 +1,29 @@ +# irqbalance is a daemon process that distributes interrupts across +# CPUs on SMP systems. The default is to rebalance once every 10 +# seconds. This is the environment file that is specified to systemd via the +# EnvironmentFile key in the service unit file (or via whatever method the init +# system you're using has). + +# +# IRQBALANCE_ONESHOT +# After starting, wait for a minute, then look at the interrupt +# load and balance it once; after balancing exit and do not change +# it again. +# +#IRQBALANCE_ONESHOT= + +# +# IRQBALANCE_BANNED_CPUS +# 64 bit bitmask which allows you to indicate which CPUs should +# be skipped when rebalancing IRQs. CPU numbers which have their +# corresponding bits set to one in this mask will not have any +# IRQs assigned to them on rebalance. +# +#IRQBALANCE_BANNED_CPUS= + +# +# IRQBALANCE_ARGS +# Append any args here to the irqbalance daemon as documented in the man +# page. +# +#IRQBALANCE_ARGS=
diff --git a/misc/irqbalance.policy.d/follow-affinity-hint.example b/misc/irqbalance.policy.d/follow-affinity-hint.example new file mode 100644 index 0000000..e8b134e --- /dev/null +++ b/misc/irqbalance.policy.d/follow-affinity-hint.example
#!/bin/sh
# Do not edit this file, create your own policy script and make it
# executable for irqbalance process, you can use this file as a
# boilerplate.
#
# Arguments: $1 is the sysfs device path, $2 is the IRQ number.
# Only POSIX sh constructs are used, so the script runs under any /bin/sh.

SYS_DEV_PATH=$1
IRQ_NUM=$2

IRQ_PATH=/proc/irq/$IRQ_NUM
UEVENT_FILE=$SYS_DEV_PATH/uevent

# The script below is an example for banning certain IRQs from
# irqbalance and strictly applying their affinity_hint setting
[ -e "$UEVENT_FILE" ] || exit 1

# IRQs from following drivers will be handled by this script
# Driver names should be separated by space
AFFINITY_WHITELIST=""

# "|| [ -n ... ]" also processes a last line that lacks a trailing newline
while read -r line || [ -n "$line" ]; do
	# Only DRIVER=<name> lines are of interest
	case $line in
	DRIVER=*) driver=${line#DRIVER=} ;;
	*) continue ;;
	esac

	# Skip drivers that are not in the whitelist
	case " $AFFINITY_WHITELIST " in
	*" $driver "*) ;;
	*) continue ;;
	esac

	affinity_hint=$(cat "$IRQ_PATH/affinity_hint" 2>/dev/null)
	# Check if affinity_hint value has at least one bit set, i.e. it
	# contains a character other than '0' and ','
	case $affinity_hint in
	*[!0,]*)
		# Ban it from irqbalance so it won't get balanced,
		# we'll follow its affinity_hint setting
		echo "ban=true"
		# If the affinity_hint value is valid, kernel would set
		# the same value for smp_affinity. But force to set that
		# again in case the IRQ was balanced before.
		echo "$affinity_hint" > "$IRQ_PATH/smp_affinity"
		# Stop further script processing
		exit 0
		;;
	esac
done < "$UEVENT_FILE"

exit 1
diff --git a/misc/irqbalance.service b/misc/irqbalance.service new file mode 100644 index 0000000..e7a3336 --- /dev/null +++ b/misc/irqbalance.service
@@ -0,0 +1,19 @@ +[Unit] +Description=irqbalance daemon +Documentation=man:irqbalance(1) +Documentation=https://github.com/Irqbalance/irqbalance +ConditionVirtualization=!container + +[Service] +EnvironmentFile=-/usr/lib/irqbalance/defaults.env +EnvironmentFile=-/path/to/irqbalance.env +ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS +CapabilityBoundingSet= +NoNewPrivileges=yes +ReadOnlyPaths=/ +ReadWritePaths=/proc/irq +RestrictAddressFamilies=AF_UNIX +RuntimeDirectory=irqbalance/ + +[Install] +WantedBy=multi-user.target
diff --git a/non-atomic.h b/non-atomic.h new file mode 100644 index 0000000..943501a --- /dev/null +++ b/non-atomic.h
@@ -0,0 +1,115 @@ +/* + +This file is copied from the Linux kernel and mildly adjusted for use in userspace + + +*/ +#ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ +#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ + +#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) + +/** + * __set_bit - Set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * Unlike set_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BITOP_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + + *p |= mask; +} + +static inline void clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BITOP_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + + *p &= ~mask; +} + +/** + * __change_bit - Toggle a bit in memory + * @nr: the bit to change + * @addr: the address to start counting from + * + * Unlike change_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void __change_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BITOP_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + + *p ^= mask; +} + +/** + * __test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. + * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. 
+ */ +static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BITOP_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long old = *p; + + *p = old | mask; + return (old & mask) != 0; +} + +/** + * __test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. + * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. + */ +static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BITOP_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long old = *p; + + *p = old & ~mask; + return (old & mask) != 0; +} + +/* WARNING: non atomic and it can be reordered! */ +static inline int __test_and_change_bit(int nr, + volatile unsigned long *addr) +{ + unsigned long mask = BITOP_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); + unsigned long old = *p; + + *p = old ^ mask; + return (old & mask) != 0; +} + +/** + * test_bit - Determine whether a bit is set + * @nr: bit number to test + * @addr: Address to start counting from + */ +static inline int test_bit(int nr, const volatile unsigned long *addr) +{ + return 1UL & (addr[BITOP_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); +} + +#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
diff --git a/numa.c b/numa.c new file mode 100644 index 0000000..13d7ebd --- /dev/null +++ b/numa.c
@@ -0,0 +1,155 @@ +/* + * Copyright (C) 2006, Intel Corporation + * Copyright (C) 2012, Neil Horman <nhorman@tuxdriver.com> + * + * This file is part of irqbalance + * + * This program file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program in a file named COPYING; if not, write to the + * Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301 USA + */ + +/* + * This file tries to map numa affinity of pci devices to their interrupts + * In addition the PCI class information is used to refine the classification + * of interrupt sources + */ +#include "config.h" +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <ctype.h> +#include <sys/types.h> +#include <dirent.h> + +#include "irqbalance.h" + +#define SYSFS_NODE_PATH "/sys/devices/system/node" + +GList *numa_nodes = NULL; + +static void add_one_node(int nodeid) +{ + char path[PATH_MAX]; + struct topo_obj *new; + + new = calloc(1, sizeof(struct topo_obj)); + if (!new) { + need_rebuild = 1; + return; + } + + if (nodeid == NUMA_NO_NODE) { + cpus_setall(new->mask); + } else { + cpus_clear(new->mask); + sprintf(path, "%s/node%d/cpumap", SYSFS_NODE_PATH, nodeid); + process_one_line(path, get_mask_from_bitmap, &new->mask); + } + + new->obj_type = OBJ_TYPE_NODE; + new->number = nodeid; + new->obj_type_list = &numa_nodes; + numa_nodes = g_list_append(numa_nodes, new); +} + +void build_numa_node_list(void) +{ + DIR *dir; + struct dirent *entry; + + /* Add the unspecified node */ + 
add_one_node(NUMA_NO_NODE); + + if (!numa_avail) + return; + + dir = opendir(SYSFS_NODE_PATH); + if (!dir) + return; + + do { + entry = readdir(dir); + if (!entry) + break; + if ((entry->d_type == DT_DIR) && + (strncmp(entry->d_name, "node", 4) == 0) && + isdigit(entry->d_name[4])) { + add_one_node(strtoul(&entry->d_name[4], NULL, 10)); + } + } while (entry); + closedir(dir); +} + +void free_numa_node_list(void) +{ + g_list_free_full(numa_nodes, free_cpu_topo); + numa_nodes = NULL; +} + +static gint compare_node(gconstpointer a, gconstpointer b) +{ + const struct topo_obj *ai = a; + const struct topo_obj *bi = b; + + return (ai->number == bi->number) ? 0 : 1; +} + +void connect_cpu_mem_topo(struct topo_obj *p, void *data __attribute__((unused))) +{ + GList *entry; + struct topo_obj *node; + int len; + + len = g_list_length(p->numa_nodes); + + if (len == 0) { + return; + } else if (len > 1) { + for_each_object(p->children, connect_cpu_mem_topo, NULL); + return; + } + + entry = g_list_first(p->numa_nodes); + node = entry->data; + + if (p->obj_type == OBJ_TYPE_PACKAGE && !p->parent) + p->parent = node; + + entry = g_list_find(node->children, p); + if (!entry) + node->children = g_list_append(node->children, p); +} + +void dump_numa_node_info(struct topo_obj *d, void *unused __attribute__((unused))) +{ + char buffer[4096]; + + log(TO_CONSOLE, LOG_INFO, "NUMA NODE NUMBER: %d\n", d->number); + cpumask_scnprintf(buffer, 4096, d->mask); + log(TO_CONSOLE, LOG_INFO, "LOCAL CPU MASK: %s\n", buffer); + log(TO_CONSOLE, LOG_INFO, "\n"); +} + +struct topo_obj *get_numa_node(int nodeid) +{ + struct topo_obj find; + GList *entry; + + find.number = numa_avail ? nodeid : NUMA_NO_NODE; + + entry = g_list_find_custom(numa_nodes, &find, compare_node); + return entry ? entry->data : NULL; +} +
diff --git a/placement.c b/placement.c new file mode 100644 index 0000000..17a9f2e --- /dev/null +++ b/placement.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2006, Intel Corporation
+ * Copyright (C) 2012, Neil Horman <nhorman@tuxdriver.com>
+ *
+ * This file is part of irqbalance
+ *
+ * This program file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program in a file named COPYING; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <limits.h>
+
+#include "types.h"
+#include "irqbalance.h"
+
+
+GList *rebalance_irq_list;
+
+/* Search state handed to find_best_object() through its data pointer */
+struct obj_placement {
+	struct topo_obj *best;		/* best candidate seen so far, or NULL */
+	uint64_t best_cost;		/* load of that candidate */
+	struct irq_info *info;		/* irq being placed */
+};
+
+/*
+ * for_each_object() callback: make d the best candidate in the
+ * obj_placement passed as data if it beats the current one. The lower
+ * load wins; on equal load the object holding fewer interrupts wins.
+ */
+static void find_best_object(struct topo_obj *d, void *data)
+{
+	struct obj_placement *best = (struct obj_placement *)data;
+	uint64_t newload;
+
+	/*
+	 * Don't consider the unspecified numa node here
+	 */
+	if (numa_avail && (d->obj_type == OBJ_TYPE_NODE) && (d->number == NUMA_NO_NODE))
+		return;
+
+	/*
+	 * also don't consider any node that doesn't have at least one cpu in
+	 * the unbanned list
+	 */
+	if ((d->obj_type == OBJ_TYPE_NODE) &&
+	    (!cpus_intersects(d->mask, unbanned_cpus)))
+		return;
+
+	if (d->powersave_mode)
+		return;
+
+	newload = d->load;
+	if (newload < best->best_cost) {
+		best->best = d;
+		best->best_cost = newload;
+	} else if (newload == best->best_cost) {
+		/*
+		 * best->best is still NULL when the very first candidate
+		 * carries the initial maximum cost: take it instead of
+		 * dereferencing a NULL pointer
+		 */
+		if (!best->best ||
+		    g_list_length(d->interrupts) < g_list_length(best->best->interrupts))
+			best->best = d;
+	}
+}
+
+/*
+ * for_each_irq() callback: move info from object d (passed as data) to
+ * the best child of d. Nothing happens when the irq is not flagged as
+ * moved or when its balance level stops it at the level of d.
+ */
+static void find_best_object_for_irq(struct irq_info *info, void *data)
+{
+	struct obj_placement place;
+	struct topo_obj *d = data;
+	struct topo_obj *asign;
+
+	if (!info->moved)
+		return;
+
+	switch (d->obj_type) {
+	case OBJ_TYPE_NODE:
+		if (info->level == BALANCE_NONE)
+			return;
+		break;
+
+	case OBJ_TYPE_PACKAGE:
+		if (info->level == BALANCE_PACKAGE)
+			return;
+		break;
+
+	case OBJ_TYPE_CACHE:
+		if (info->level == BALANCE_CACHE)
+			return;
+		break;
+
+	case OBJ_TYPE_CPU:
+		if (info->level == BALANCE_CORE)
+			return;
+		break;
+
+	default:
+		break;
+	}
+
+	place.info = info;
+	place.best = NULL;
+	place.best_cost = ULLONG_MAX;
+
+	for_each_object(d->children, find_best_object, &place);
+
+	asign = place.best;
+
+	if (asign) {
+		migrate_irq(&d->interrupts, &asign->interrupts, info);
+		info->assigned_obj = asign;
+		asign->load += info->load;
+	}
+}
+
+/* for_each_object() callback: push every irq held by d down to a child of d */
+static void place_irq_in_object(struct topo_obj *d, void *data __attribute__((unused)))
+{
+	if (g_list_length(d->interrupts) > 0)
+		for_each_irq(d->interrupts, find_best_object_for_irq, d);
+}
+
+/*
+ * for_each_irq() callback: move info from rebalance_irq_list onto a numa
+ * node. An irq that has a specific numa node (or any irq when numa is not
+ * available) goes to its own node if that node has an unbanned cpu;
+ * otherwise the best node found by find_best_object() is used.
+ */
+static void place_irq_in_node(struct irq_info *info, void *data __attribute__((unused)))
+{
+	struct obj_placement place;
+	struct topo_obj *asign;
+
+	if ((info->level == BALANCE_NONE) && cpus_empty(banned_cpus))
+		return;
+
+	if (irq_numa_node(info)->number != NUMA_NO_NODE || !numa_avail) {
+		/*
+		 * Need to make sure this node is eligible for migration
+		 * given the banned cpu list
+		 */
+		if (!cpus_intersects(irq_numa_node(info)->mask, unbanned_cpus))
+			goto find_placement;
+		/*
+		 * This irq belongs to a device with a preferred numa node
+		 * put it on that node
+		 */
+		migrate_irq(&rebalance_irq_list, &irq_numa_node(info)->interrupts, info);
+		info->assigned_obj = irq_numa_node(info);
+		irq_numa_node(info)->load += info->load + 1;
+
+		return;
+	}
+
+find_placement:
+	place.best_cost = ULLONG_MAX;
+	place.best = NULL;
+	place.info = info;
+
+	for_each_object(numa_nodes, find_best_object, &place);
+
+	asign = place.best;
+
+	if (asign) {
+		migrate_irq(&rebalance_irq_list, &asign->interrupts, info);
+		info->assigned_obj = asign;
+		asign->load += info->load;
+	}
+}
+
+/* for_each_irq() callback: log any irq whose assigned_obj is not data */
+static void validate_irq(struct irq_info *info, void *data)
+{
+	/* %p requires a void pointer argument */
+	if (info->assigned_obj != data)
+		log(TO_CONSOLE, LOG_INFO, "object validation error: irq %d is wrong, points to %p, should be %p\n",
+		    info->irq, (void *)info->assigned_obj, data);
+}
+
+/* for_each_object() callback: validate every irq held by d */
+static void validate_object(struct topo_obj *d, void *data __attribute__((unused)))
+{
+	if (g_list_length(d->interrupts) > 0)
+		for_each_irq(d->interrupts, validate_irq, d);
+}
+
+/* Validate the irqs held by every package, cache domain and cpu */
+static void validate_object_tree_placement(void)
+{
+	for_each_object(packages, validate_object, NULL);
+	for_each_object(cache_domains, validate_object, NULL);
+	for_each_object(cpus, validate_object, NULL);
+}
+
+/*
+ * Place every irq on rebalance_irq_list: first onto a numa node, then
+ * down through packages and cache domains to their children. In debug
+ * mode the resulting assignments are validated afterwards.
+ */
+void calculate_placement(void)
+{
+	sort_irq_list(&rebalance_irq_list);
+	if (g_list_length(rebalance_irq_list) > 0) {
+		for_each_irq(rebalance_irq_list, place_irq_in_node, NULL);
+		for_each_object(numa_nodes, place_irq_in_object, NULL);
+		for_each_object(packages, place_irq_in_object, NULL);
+		for_each_object(cache_domains, place_irq_in_object, NULL);
+	}
+	if (debug_mode)
+		validate_object_tree_placement();
+}
diff --git a/procinterrupts.c b/procinterrupts.c new file mode 100644 index 0000000..858b66b --- /dev/null +++ b/procinterrupts.c
@@ -0,0 +1,578 @@
+/*
+ * Copyright (C) 2006, Intel Corporation
+ * Copyright (C) 2012, Neil Horman <nhorman@tuxdriver.com>
+ *
+ * This file is part of irqbalance
+ *
+ * This program file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program in a file named COPYING; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+#include <syslog.h>
+#include <ctype.h>
+#include <errno.h>
+
+#include "cpumask.h"
+#include "irqbalance.h"
+
+#ifdef AARCH64
+#include <sys/types.h>
+#include <regex.h>
+#include <dirent.h>
+#endif
+
+#define LINESIZE 4096
+
+static int proc_int_has_msi = 0;
+static int msi_found_in_sysfs = 0;
+
+#ifdef AARCH64
+/* One rule of the name based irq classifier used on AARCH64 */
+struct irq_match {
+	char *matchstring;
+	regex_t rcomp;
+	int (*refine_match)(char *name, struct irq_info *info);
+	int type;
+	int class;
+};
+
+/*
+ * Refine the type and class of a platform device irq by looking for an
+ * entry starting with "ata", "net" or "usb" in
+ * /sys/devices/platform/<name>/. Returns 0 when info was updated and
+ * -ENOENT otherwise.
+ */
+static int check_platform_device(char *name, struct irq_info *info)
+{
+	DIR *dirfd;
+	char path[512];
+	struct dirent *ent;
+	int rc = -ENOENT, i, len;
+	static struct pdev_irq_info {
+		char *d_name;
+		int type;
+		int class;
+	} pdev_irq_info[] = {
+		{"ata", IRQ_TYPE_LEGACY, IRQ_SCSI},
+		{"net", IRQ_TYPE_LEGACY, IRQ_ETH},
+		{"usb", IRQ_TYPE_LEGACY, IRQ_OTHER},
+		{NULL},
+	};
+
+	/* name is read from /proc/interrupts: never let it overflow path */
+	len = snprintf(path, sizeof(path), "/sys/devices/platform/%s/", name);
+	if (len < 0 || (size_t)len >= sizeof(path)) {
+		log(TO_ALL, LOG_DEBUG, "Platform device name too long: %s\n", name);
+		return -ENOENT;
+	}
+
+	dirfd = opendir(path);
+	if (!dirfd) {
+		log(TO_ALL, LOG_DEBUG, "No directory %s: %s\n", path, strerror(errno));
+		return -ENOENT;
+	}
+
+	while ((ent = readdir(dirfd)) != NULL) {
+
+		log(TO_ALL, LOG_DEBUG, "Checking entry %s\n", ent->d_name);
+		for (i = 0; pdev_irq_info[i].d_name != NULL; i++) {
+			if (!strncmp(ent->d_name, pdev_irq_info[i].d_name, strlen(pdev_irq_info[i].d_name))) {
+				info->type = pdev_irq_info[i].type;
+				info->class = pdev_irq_info[i].class;
+				rc = 0;
+				goto out;
+			}
+		}
+	}
+
+out:
+	closedir(dirfd);
+	log(TO_ALL, LOG_DEBUG, "IRQ %s is of type %d and class %d\n", name, info->type, info->class);
+	return rc;
+
+}
+
+/*
+ * Guess the type and class of an irq from its name by trying each rule in
+ * the matches table in order; the first rule whose regex matches wins and
+ * may refine its guess through refine_match. info is left untouched if
+ * the rules cannot be compiled.
+ */
+static void guess_arm_irq_hints(char *name, struct irq_info *info)
+{
+	int i, rc;
+	static int compiled = 0;
+	/* Note: Last entry is a catchall */
+	static struct irq_match matches[] = {
+		{ "eth.*" ,{NULL} ,NULL, IRQ_TYPE_LEGACY, IRQ_GBETH },
+		{ "[A-Z0-9]{4}[0-9a-f]{4}", {NULL} ,check_platform_device, IRQ_TYPE_LEGACY, IRQ_OTHER},
+		{ "PNP[0-9a-f]{4}", {NULL} ,check_platform_device, IRQ_TYPE_LEGACY, IRQ_OTHER},
+		{ ".*", {NULL}, NULL, IRQ_TYPE_LEGACY, IRQ_OTHER},
+		{NULL},
+	};
+
+
+	if (!compiled) {
+		for (i=0; matches[i].matchstring != NULL; i++) {
+			rc = regcomp(&matches[i].rcomp, matches[i].matchstring, REG_EXTENDED | REG_NOSUB);
+			if (rc) {
+				char errbuf[256];
+				regerror(rc, &matches[i].rcomp, errbuf, sizeof(errbuf));
+				log(TO_ALL, LOG_WARNING, "WARNING: Failed to compile regex %s : %s\n",
+				    matches[i].matchstring, errbuf);
+				/*
+				 * compiled stays 0, so every rule is compiled
+				 * again on the next call: release the ones
+				 * that succeeded so they are not leaked
+				 */
+				while (i-- > 0)
+					regfree(&matches[i].rcomp);
+				return;
+			}
+		}
+
+		compiled = 1;
+	}
+
+	for (i=0; matches[i].matchstring != NULL; i++) {
+		if (!regexec(&matches[i].rcomp, name, 0, NULL, 0)) {
+			info->type = matches[i].type;
+			info->class = matches[i].class;
+			if (matches[i].refine_match)
+				matches[i].refine_match(name, info);
+			log(TO_ALL, LOG_DEBUG, "IRQ %s(%d) guessed as class %d\n", name, info->irq,info->class);
+			break;
+		}
+	}
+
+
+}
+#endif
+
+/*
+ * Build a list with one newly allocated irq_info per numbered line of
+ * /proc/interrupts. Each entry carries the irq number, a guessed type
+ * and class, and a copy of the last token of the line as its name.
+ * Returns NULL if the file cannot be opened or has no header line.
+ */
+GList* collect_full_irq_list(void)
+{
+	GList *tmp_list = NULL;
+	FILE *file;
+	char *line = NULL;
+	size_t size = 0;
+	char *irq_name, *irq_mod, *savedptr, *last_token, *p;
+#ifdef AARCH64
+	char *tmp;
+#endif
+
+	file = fopen("/proc/interrupts", "r");
+	if (!file)
+		return NULL;
+
+	/* first line is the header we don't need; nuke it */
+	if (getline(&line, &size, file)<=0) {
+		free(line);
+		fclose(file);
+		return NULL;
+	}
+
+	while (getline(&line, &size, file) > 0) {
+		int number;
+		int is_xen_dyn = 0;
+		struct irq_info *info;
+		char *c;
+		char *savedline = NULL;
+
+		/*
+		 * lines with letters in front are special, like NMI count.
+		 * Parsing stops at the first such line
+		 */
+		c = line;
+		/* ctype functions need a value representable as unsigned char */
+		while (isblank((unsigned char)*c))
+			c++;
+
+		if (!isdigit((unsigned char)*c))
+			break;
+		c = strchr(line, ':');
+		if (!c)
+			continue;
+
+		savedline = strdup(line);
+		if (!savedline)
+			break;
+		irq_name = strtok_r(savedline, " ", &savedptr);
+		last_token = strtok_r(NULL, " ", &savedptr);
+		if (!irq_name || !last_token) {
+			/* nothing follows the irq number: there is no name to record */
+			free(savedline);
+			continue;
+		}
+		if (strstr(irq_name, "xen-dyn") != NULL)
+			is_xen_dyn = 1;
+		while ((p = strtok_r(NULL, " ", &savedptr))) {
+			irq_name = last_token;
+			if (strstr(irq_name, "xen-dyn") != NULL)
+				is_xen_dyn = 1;
+			last_token = p;
+		}
+
+#ifdef AARCH64
+		/* Of course the formatting for /proc/interrupts is different on different arches */
+		irq_name = last_token;
+		tmp = strchr(irq_name, '\n');
+		if (tmp)
+			*tmp = 0;
+#endif
+		irq_mod = last_token;
+
+		*c = 0;
+		number = strtoul(line, NULL, 10);
+
+		info = calloc(1, sizeof(struct irq_info));
+		if (info) {
+			info->irq = number;
+			if (strstr(irq_name, "-event") != NULL && is_xen_dyn == 1) {
+				info->type = IRQ_TYPE_VIRT_EVENT;
+				info->class = IRQ_VIRT_EVENT;
+			} else {
+#ifdef AARCH64
+				guess_arm_irq_hints(irq_name, info);
+#else
+				info->type = IRQ_TYPE_LEGACY;
+				info->class = IRQ_OTHER;
+#endif
+			}
+			info->name = strdup(irq_mod);
+			tmp_list = g_list_append(tmp_list, info);
+		}
+		free(savedline);
+	}
+	fclose(file);
+	free(line);
+	return tmp_list;
+}
+
+/*
+ * Refresh the per irq interrupt counts from /proc/interrupts. For each
+ * numbered line the per cpu counts are summed; the previous total is
+ * kept in last_irq_count. need_rescan is set and parsing stops when an
+ * irq is unknown, the number of count columns differs from the number of
+ * online cpus, or a total went backwards.
+ */
+void parse_proc_interrupts(void)
+{
+	FILE *file;
+	char *line = NULL;
+	size_t size = 0;
+
+	file = fopen("/proc/interrupts", "r");
+	if (!file)
+		return;
+
+	/* first line is the header we don't need; nuke it */
+	if (getline(&line, &size, file)<=0) {
+		free(line);
+		fclose(file);
+		return;
+	}
+
+	while (getline(&line, &size, file) > 0) {
+		int cpunr;
+		int number;
+		uint64_t count;
+		char *c, *c2;
+		struct irq_info *info;
+
+		if (!proc_int_has_msi)
+			if (strstr(line, "MSI") != NULL)
+				proc_int_has_msi = 1;
+
+		/*
+		 * lines with letters in front are special, like NMI count.
+		 * Parsing stops at the first such line
+		 */
+		c = line;
+		/* ctype functions need a value representable as unsigned char */
+		while (isblank((unsigned char)*c))
+			c++;
+
+		if (!isdigit((unsigned char)*c))
+			break;
+		c = strchr(line, ':');
+		if (!c)
+			continue;
+
+		*c = 0;
+		c++;
+		number = strtoul(line, NULL, 10);
+
+		info = get_irq_info(number);
+		if (!info) {
+			need_rescan = 1;
+			break;
+		}
+
+		count = 0;
+		cpunr = 0;
+
+		c2=NULL;
+		while (1) {
+			uint64_t C;
+			C = strtoull(c, &c2, 10);
+			if (c==c2) /* end of numbers */
+				break;
+			count += C;
+			c=c2;
+			cpunr++;
+		}
+		if (cpunr != num_online_cpus()) {
+			need_rescan = 1;
+			break;
+		}
+
+		/* IRQ removed and reinserted, need restart or this will
+		 * cause an overflow and IRQ won't be rebalanced again
+		 */
+		if (count < info->irq_count) {
+			need_rescan = 1;
+			break;
+		}
+
+		info->last_irq_count = info->irq_count;
+		info->irq_count = count;
+
+		/* is interrupt MSI based? */
+		if ((info->type == IRQ_TYPE_MSI) || (info->type == IRQ_TYPE_MSIX))
+			msi_found_in_sysfs = 1;
+	}
+	if ((proc_int_has_msi) && (!msi_found_in_sysfs) && (!need_rescan)) {
+		log(TO_ALL, LOG_WARNING, "WARNING: MSI interrupts found in /proc/interrupts\n");
+		log(TO_ALL, LOG_WARNING, "But none found in sysfs, you need to update your kernel\n");
+		log(TO_ALL, LOG_WARNING, "Until then, IRQs will be improperly classified\n");
+		/*
+		 * Set msi_found_in_sysfs, so we don't get this error constantly
+		 */
+		msi_found_in_sysfs = 1;
+	}
+	fclose(file);
+	free(line);
+}
+
+
+/*
+ * for_each_irq() callback: set the load of info to the number of
+ * interrupts it received since the last sample times the per interrupt
+ * load slice passed as data
+ */
+static void assign_load_slice(struct irq_info *info, void *data)
+{
+	uint64_t *load_slice = data;
+	info->load = (info->irq_count - info->last_irq_count) * *load_slice;
+
+	/*
+	 * Every IRQ has at least a load of 1
+	 */
+	if (!info->load)
+		info->load++;
+}
+
+/*
+ * Recursive helper to estimate the number of irqs shared between
+ * multiple topology objects that was handled by this particular object
+ */
+static uint64_t get_parent_branch_irq_count_share(struct topo_obj *d)
+{
+	uint64_t total_irq_count = 0;
+
+	if (d->parent) {
+		total_irq_count = get_parent_branch_irq_count_share(d->parent);
+		total_irq_count /= g_list_length((d->parent)->children);
+	}
+
+	total_irq_count += d->irq_count;
+
+	return total_irq_count;
+}
+
+/* for_each_object() callback: add the irq_count of d and all its descendants to *data */
+static void get_children_branch_irq_count(struct topo_obj *d, void *data)
+{
+	uint64_t *total_irq_count = data;
+
+	if (g_list_length(d->children) > 0)
+		for_each_object(d->children, get_children_branch_irq_count, total_irq_count);
+
+	*total_irq_count += d->irq_count;
+}
+
+/*
+ * for_each_object() callback: split the load of d between the irqs it
+ * holds. The load per interrupt is the load of d divided by the irq
+ * counts attributed to d, its share of its ancestors and its descendants.
+ */
+static void compute_irq_branch_load_share(struct topo_obj *d, void *data __attribute__((unused)))
+{
+	uint64_t local_irq_counts = 0;
+	uint64_t load_slice;
+
+	if (g_list_length(d->interrupts) > 0) {
+		local_irq_counts = get_parent_branch_irq_count_share(d);
+		if (g_list_length(d->children) > 0)
+			for_each_object(d->children, get_children_branch_irq_count, &local_irq_counts);
+		load_slice = local_irq_counts ? (d->load / local_irq_counts) : 1;
+		for_each_irq(d->interrupts, assign_load_slice, &load_slice);
+	}
+
+}
+
+/* for_each_irq() callback: add the interrupts info received since the last sample to *data */
+static void accumulate_irq_count(struct irq_info *info, void *data)
+{
+	uint64_t *acc = data;
+
+	*acc += (info->irq_count - info->last_irq_count);
+}
+
+/* for_each_object() callback: recompute irq_count of d and of all its descendants */
+static void accumulate_interrupts(struct topo_obj *d, void *data __attribute__((unused)))
+{
+	if (g_list_length(d->children) > 0) {
+		for_each_object(d->children, accumulate_interrupts, NULL);
+	}
+
+	d->irq_count = 0;
+	if (g_list_length(d->interrupts) > 0)
+		for_each_irq(d->interrupts, accumulate_irq_count, &(d->irq_count));
+}
+
+/* for_each_object() callback: add the load of d to *data */
+static void accumulate_load(struct topo_obj *d, void *data)
+{
+	uint64_t *load = data;
+
+	*load += d->load;
+}
+
+/* for_each_object() callback: set the load of every object with children to the sum of its children's loads */
+static void set_load(struct topo_obj *d, void *data __attribute__((unused)))
+{
+	if (g_list_length(d->children) > 0) {
+		for_each_object(d->children, set_load, NULL);
+		d->load = 0;
+		for_each_object(d->children, accumulate_load, &(d->load));
+	}
+}
+
+/*
+ * Sample the irq and softirq time of every unbanned cpu from /proc/stat,
+ * store the change since the previous sample as the load of the cpu once
+ * cycle_count is non-zero, then propagate load and irq counts through the
+ * topology so that each irq gets a share of the load of its object.
+ */
+void parse_proc_stat(void)
+{
+	FILE *file;
+	char *line = NULL;
+	size_t size = 0;
+	int cpunr, rc, cpucount;
+	struct topo_obj *cpu;
+	unsigned long long irq_load, softirq_load;
+
+	file = fopen("/proc/stat", "r");
+	if (!file) {
+		log(TO_ALL, LOG_WARNING, "WARNING cant open /proc/stat. balancing is broken\n");
+		return;
+	}
+
+	/* first line is the header we don't need; nuke it */
+	if (getline(&line, &size, file)<=0) {
+		free(line);
+		log(TO_ALL, LOG_WARNING, "WARNING read /proc/stat. balancing is broken\n");
+		fclose(file);
+		return;
+	}
+
+	cpucount = 0;
+	while (getline(&line, &size, file) > 0) {
+		if (!strstr(line, "cpu"))
+			break;
+
+		cpunr = strtoul(&line[3], NULL, 10);
+
+		if (cpu_isset(cpunr, banned_cpus))
+			continue;
+
+		rc = sscanf(line, "%*s %*u %*u %*u %*u %*u %llu %llu", &irq_load, &softirq_load);
+		if (rc < 2)
+			break;
+
+		cpu = find_cpu_core(cpunr);
+
+		if (!cpu)
+			break;
+
+		cpucount++;
+
+		/*
+		 * For each cpu add the irq and softirq load and propagate that
+		 * all the way up the device tree
+		 */
+		if (cycle_count) {
+			cpu->load = (irq_load + softirq_load) - (cpu->last_load);
+			/*
+			 * the [soft]irq_load values are in jiffies, with
+			 * HZ jiffies per second. Convert the load to nanoseconds
+			 * to get a better integer resolution of nanoseconds per
+			 * interrupt.
+			 */
+			cpu->load *= NSEC_PER_SEC/HZ;
+		}
+		cpu->last_load = (irq_load + softirq_load);
+	}
+
+	fclose(file);
+	free(line);
+	if (cpucount != get_cpu_count()) {
+		log(TO_ALL, LOG_WARNING, "WARNING, didn't collect load info for all cpus, balancing is broken\n");
+		return;
+	}
+
+	/*
+	 * Set the load values for all objects above cpus
+	 */
+	for_each_object(numa_nodes, set_load, NULL);
+
+	/*
+	 * Collect local irq_count on each object
+	 */
+	for_each_object(numa_nodes, accumulate_interrupts, NULL);
+
+	/*
+	 * Now that we have load for each cpu attribute a fair share of the load
+	 * to each irq on that cpu
+	 */
+	for_each_object(cpus, compute_irq_branch_load_share, NULL);
+	for_each_object(cache_domains, compute_irq_branch_load_share, NULL);
+	for_each_object(packages, compute_irq_branch_load_share, NULL);
+	for_each_object(numa_nodes, compute_irq_branch_load_share, NULL);
+
+}
diff --git a/tests/Makefile.am b/tests/Makefile.am new file mode 100644 index 0000000..293f1e7 --- /dev/null +++ b/tests/Makefile.am
@@ -0,0 +1,3 @@
+# runoneshot.sh is the only test run by "make check"
+check_SCRIPTS = runoneshot.sh
+TESTS = runoneshot.sh
diff --git a/tests/runoneshot.sh b/tests/runoneshot.sh new file mode 100755 index 0000000..9fb6ecd --- /dev/null +++ b/tests/runoneshot.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+# exec replaces this shell with irqbalance, so the exit status of the script is that of irqbalance
+exec ../irqbalance --debug --oneshot --foreground
+
diff --git a/types.h b/types.h new file mode 100644 index 0000000..a01d649 --- /dev/null +++ b/types.h
@@ -0,0 +1,85 @@
+#ifndef _INCLUDE_GUARD_TYPES_H
+#define _INCLUDE_GUARD_TYPES_H
+
+#include <stdint.h>
+
+#include <glib.h>
+
+#include "cpumask.h"
+
+/*
+ * Balance levels (irq_info.level)
+ */
+#define BALANCE_NONE		0
+#define BALANCE_PACKAGE		1
+#define BALANCE_CACHE		2
+#define BALANCE_CORE		3
+
+/*
+ * IRQ Classes
+ */
+#define IRQ_NODEF	-1
+#define IRQ_OTHER	0
+#define IRQ_LEGACY	1
+#define IRQ_SCSI	2
+#define IRQ_VIDEO	3
+#define IRQ_ETH		4
+#define IRQ_GBETH	5
+#define IRQ_10GBETH	6
+#define IRQ_VIRT_EVENT	7
+
+/*
+ * IRQ Types
+ */
+#define IRQ_TYPE_LEGACY		0
+#define IRQ_TYPE_MSI		1
+#define IRQ_TYPE_MSIX		2
+#define IRQ_TYPE_VIRT_EVENT	3
+
+/*
+ * IRQ Internal tracking flags
+ */
+#define IRQ_FLAG_BANNED		1
+
+/* Kind of a topology object, ordered from the leaf (cpu) up to the numa node */
+enum obj_type_e {
+	OBJ_TYPE_CPU,
+	OBJ_TYPE_CACHE,
+	OBJ_TYPE_PACKAGE,
+	OBJ_TYPE_NODE
+};
+
+/* One object of the cpu topology: a cpu, cache domain, package or numa node */
+struct topo_obj {
+	uint64_t load;
+	uint64_t last_load;
+	uint64_t irq_count;
+	enum obj_type_e obj_type;
+	int number;
+	int powersave_mode;
+	cpumask_t mask;
+	GList *interrupts;
+	struct topo_obj *parent;
+	GList *children;
+	GList *numa_nodes;
+	GList **obj_type_list;
+};
+
+/* One interrupt source and its balancing state */
+struct irq_info {
+	int irq;
+	int class;
+	int type;
+	int level;
+	int flags;
+	struct topo_obj *numa_node;
+	cpumask_t cpumask;
+	uint64_t irq_count;
+	uint64_t last_irq_count;
+	uint64_t load;
+	int moved;
+	struct topo_obj *assigned_obj;
+	char *name;
+};
+
+#endif
diff --git a/ui/helpers.c b/ui/helpers.c new file mode 100644 index 0000000..5d71275 --- /dev/null +++ b/ui/helpers.c
@@ -0,0 +1,181 @@
+
+#include <glib.h>
+#include <glib-unix.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "helpers.h"
+#include "ui.h"
+
+
+/* GCompareFunc: order two ints, given by pointer, ascending */
+gint sort_ints(gconstpointer First, gconstpointer Second)
+{
+	const int *first = First;
+	const int *second = Second;
+
+	return (*first > *second) - (*first < *second);
+}
+
+/* GCompareFunc: order two cpu_ban_t objects by ascending cpu number */
+gint sort_all_cpus(gconstpointer First, gconstpointer Second)
+{
+	const cpu_ban_t *first = First;
+	const cpu_ban_t *second = Second;
+
+	/* equal numbers should never happen */
+	return (first->number > second->number) - (first->number < second->number);
+}
+
+/* GCompareFunc: order two irq_t objects by ascending vector */
+gint sort_all_irqs(gconstpointer First, gconstpointer Second)
+{
+	const irq_t *first = First;
+	const irq_t *second = Second;
+
+	/* equal vectors should never happen */
+	return (first->vector > second->vector) - (first->vector < second->vector);
+}
+
+/*
+ * Convert one hexadecimal digit into a newly allocated string of four
+ * '0'/'1' characters, most significant bit first. Any other character
+ * gives "0000". The caller frees the result. Returns NULL if no memory
+ * is available.
+ */
+char * hex_to_bitmap(char hex_digit) {
+	uint8_t digit = 0;
+	char *bitmap;
+	int i;
+
+	if((hex_digit >= '0') && (hex_digit <= '9')) {
+		digit = hex_digit - '0';
+	} else if((hex_digit >= 'a') && (hex_digit <= 'f')) {
+		digit = hex_digit - 'a' + 10;
+	} else if((hex_digit >= 'A') && (hex_digit <= 'F')) {
+		digit = hex_digit - 'A' + 10;
+	}
+
+	/*
+	 * Always return heap memory, also for characters that are not hex
+	 * digits: parse_setup() passes the result to free(), which must
+	 * never be given a string literal
+	 */
+	bitmap = malloc(5 * sizeof(char));
+	if(!bitmap) {
+		return NULL;
+	}
+	bitmap[4] = '\0';
+	for(i = 3; i >= 0; i--) {
+		bitmap[i] = digit % 2 ? '1' : '0';
+		digit /= 2;
+	}
+	return bitmap;
+}
+
+/* GCopyFunc: return a newly allocated copy of a cpu_ban_t, or NULL if out of memory */
+gpointer copy_cpu_ban(gconstpointer src, gpointer data __attribute__((unused)))
+{
+	const cpu_ban_t *old = src;
+	cpu_ban_t *new = malloc(sizeof(cpu_ban_t));
+
+	if(!new) {
+		return NULL;
+	}
+	new->number = old->number;
+	new->is_banned = old->is_banned;
+	return new;
+}
+
+/*
+ * GCopyFunc: return a newly allocated copy of an irq_t, or NULL if out of
+ * memory. The assigned_to list is copied shallowly: the copy shares its
+ * elements with the original.
+ */
+gpointer copy_irq(gconstpointer src, gpointer data __attribute__((unused)))
+{
+	const irq_t *old = src;
+	irq_t *new = malloc(sizeof(irq_t));
+
+	if(!new) {
+		return NULL;
+	}
+	new->vector = old->vector;
+	new->load = old->load;
+	new->diff = old->diff;
+	new->is_banned = old->is_banned;
+	new->class = old->class;
+	new->assigned_to = g_list_copy(old->assigned_to);
+	return new;
+}
+
+/* Call fp(cpu, data) for every cpu_ban_t in list, starting at the first element */
+void for_each_cpu(GList *list, void (*fp)(cpu_ban_t *cpu, void *data), void *data)
+{
+	GList *entry;
+	entry = g_list_first(list);
+	while(entry) {
+		fp(entry->data, data);
+		entry = g_list_next(entry);
+	}
+}
+
+/* Call fp(number, data) for every int in list, starting at the first element */
+void for_each_int(GList *list, void (*fp)(int *number, void *data), void *data)
+{
+	GList *entry;
+	entry = g_list_first(list);
+	while(entry) {
+		fp(entry->data, data);
+		entry = g_list_next(entry);
+	}
+}
+
+/* Call fp(irq, data) for every irq_t in list, starting at the first element */
+void for_each_irq(GList *list, void (*fp)(irq_t *irq, void *data), void *data)
+{
+	GList *entry;
+	entry = g_list_first(list);
+	while(entry) {
+		fp(entry->data, data);
+		entry = g_list_next(entry);
+	}
+}
+
+/* Call fp(node, data) for every cpu_node_t in list, starting at the first element */
+void for_each_node(GList *list, void (*fp)(cpu_node_t *node, void *data), void *data)
+{
+	GList *entry;
+	entry = g_list_first(list);
+	while(entry) {
+		fp(entry->data, data);
+		entry = g_list_next(entry);
+	}
+}
+
+/* Programmer debugging functions */
+
+/* for_each_irq() callback: print the vector of irq */
+void dump_irq(irq_t *irq, void *data __attribute__((unused)))
+{
+	printf("IRQ %d\n", irq->vector);
+}
+
+/* for_each_node() callback: print node, its irqs and, recursively, its children */
+void dump_node(cpu_node_t *node, void *data __attribute__((unused)))
+{
+	printf("TYPE %d NUMBER %d\n", node->type, node->number);
+	if(g_list_length(node->irqs) > 0) {
+		for_each_irq(node->irqs, dump_irq, NULL);
+	}
+	if(g_list_length(node->children) > 0) {
+		for_each_node(node->children, dump_node, NULL);
+	}
+}
+
+/* Print every node of the global tree */
+void dump_tree(void)
+{
+	for_each_node(tree, dump_node, NULL);
+}
+
diff --git a/ui/helpers.h b/ui/helpers.h new file mode 100644 index 0000000..b8d9fcc --- /dev/null +++ b/ui/helpers.h
@@ -0,0 +1,34 @@
+
+#ifndef HELPERS_H
+#define HELPERS_H
+
+#include "irqbalance-ui.h"
+
+/* List of top level cpu_node_t objects; defined in irqbalance-ui.c */
+extern GList *tree;
+
+
+/* Helper functions */
+
+/* Comparison functions for sorting GLists of int, cpu_ban_t and irq_t */
+gint sort_ints(gconstpointer First, gconstpointer Second);
+gint sort_all_cpus(gconstpointer First, gconstpointer Second);
+gint sort_all_irqs(gconstpointer First, gconstpointer Second);
+char * hex_to_bitmap(char hex_digit);
+gpointer copy_cpu_ban(gconstpointer src, gpointer data);
+gpointer copy_irq(gconstpointer src, gpointer data);
+void for_each_cpu(GList *list, void (*fp)(cpu_ban_t *cpu, void *data), void *data);
+void for_each_int(GList *list, void (*fp)(int *number, void *data), void *data);
+void for_each_irq(GList *list, void (*fp)(irq_t *irq, void *data), void *data);
+void for_each_node(GList *list, void (*fp)(cpu_node_t *node, void *data), void *data);
+
+
+/* Programmer debugging functions */
+
+void dump_irq(irq_t *irq, void *data __attribute__((unused)));
+void dump_node(cpu_node_t *node, void *data __attribute__((unused)));
+/* (void): an empty parameter list would leave the arguments unchecked */
+void dump_tree(void);
+
+
+#endif /* HELPERS_H */
diff --git a/ui/irqbalance-ui.c b/ui/irqbalance-ui.c new file mode 100644 index 0000000..ed8f408 --- /dev/null +++ b/ui/irqbalance-ui.c
@@ -0,0 +1,637 @@
+
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <netdb.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <unistd.h>
+#include <curses.h>
+#include <ncurses.h>
+#include "irqbalance-ui.h"
+#include "ui.h"
+#include "helpers.h"
+
+
+/* Size of the buffer that receives one reply from irqbalance */
+#define REPLY_BUF_SIZE 8192
+/* Room for "0x", sixteen hex digits and the terminator */
+#define CPU_MASK_LEN 24
+
+int irqbalance_pid = -1;
+GList *tree = NULL;
+setup_t setup;
+GMainLoop *main_loop;
+int is_tree = 1;
+
+/*
+ * Build a msghdr whose control data carries the pid, effective uid and
+ * effective gid of this process as SCM_CREDENTIALS. The caller sets
+ * msg_iov and frees both msg_control and the msghdr. Returns NULL if no
+ * memory is available.
+ */
+struct msghdr * create_credentials_msg(void)
+{
+	struct ucred credentials;
+	struct msghdr *msg;
+	struct cmsghdr *cmsg;
+
+	credentials.pid = getpid();
+	credentials.uid = geteuid();
+	credentials.gid = getegid();
+
+	msg = calloc(1, sizeof(*msg));
+	if(!msg) {
+		return NULL;
+	}
+	msg->msg_iovlen = 1;
+	/* zeroed so that no uninitialised padding is handed to sendmsg() */
+	msg->msg_control = calloc(1, CMSG_SPACE(sizeof(struct ucred)));
+	if(!msg->msg_control) {
+		free(msg);
+		return NULL;
+	}
+	msg->msg_controllen = CMSG_SPACE(sizeof(struct ucred));
+
+	cmsg = CMSG_FIRSTHDR(msg);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_CREDENTIALS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+	memcpy(CMSG_DATA(cmsg), &credentials, sizeof(struct ucred));
+
+	return msg;
+}
+
+/*
+ * Connect a stream socket to the socket file named after irqbalance_pid.
+ * Returns the connected descriptor, or 0 on failure.
+ */
+int init_connection(void)
+{
+	struct sockaddr_un addr;
+	memset(&addr, 0, sizeof(struct sockaddr_un));
+
+	int socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0);
+	if(socket_fd < 0) {
+		perror("Error opening socket");
+		return 0;
+	}
+	addr.sun_family = AF_UNIX;
+
+	snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s%d.sock", SOCKET_TMPFS,
+		 SOCKET_PATH, irqbalance_pid);
+
+	if(connect(socket_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+		/*
+		 * Try connect to abstract
+		 * NOTE(review): sun_path is all zeroes here - confirm this is
+		 * the name the daemon binds its abstract socket to
+		 */
+		memset(&addr, 0, sizeof(struct sockaddr_un));
+		addr.sun_family = AF_UNIX;
+		if (connect(socket_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+			close(socket_fd);
+			return 0;
+		}
+
+	}
+
+	return socket_fd;
+}
+
+/*
+ * Send a settings command to irqbalance. Nothing is sent if the
+ * connection or the credentials message cannot be set up.
+ */
+void send_settings(char *data)
+{
+	/* Send "settings sleep X" to set sleep interval, "settings ban
+	 * irqs X Y..." to ban IRQs from balancing,
+	 * "settings cpus <banned_list>" to setup which CPUs are forbidden
+	 * to handle IRQs
+	 */
+	struct msghdr *msg;
+	struct iovec iov;
+	int socket_fd = init_connection();
+	if(!socket_fd) {
+		return;
+	}
+
+	msg = create_credentials_msg();
+	if(!msg) {
+		close(socket_fd);
+		return;
+	}
+	iov.iov_base = (void *) data;
+	iov.iov_len = strlen(data);
+	msg->msg_iov = &iov;
+	sendmsg(socket_fd, msg, 0);
+
+	close(socket_fd);
+	free(msg->msg_control);
+	free(msg);
+}
+
+/*
+ * Send a request to irqbalance and return at most REPLY_BUF_SIZE - 1
+ * bytes of its reply as a newly allocated, NUL terminated string that the
+ * caller frees. Returns NULL if setup, allocation or recv() fails.
+ */
+char * get_data(char *string)
+{
+	/* Send "setup" to get sleep interval, banned IRQs and banned CPUs,
+	 * "stats" to get CPU tree statistics
+	 */
+	struct msghdr *msg;
+	struct iovec iov;
+	char *data;
+	ssize_t len;
+	int socket_fd = init_connection();
+	if(!socket_fd) {
+		return NULL;
+	}
+
+	msg = create_credentials_msg();
+	if(!msg) {
+		close(socket_fd);
+		return NULL;
+	}
+	iov.iov_base = (void *) string;
+	iov.iov_len = strlen(string);
+	msg->msg_iov = &iov;
+	sendmsg(socket_fd, msg, 0);
+	free(msg->msg_control);
+	free(msg);
+
+	/*
+	 * This is just...horrible. Mental note to replace this
+	 * With a select, ioctl to determine size, and malloc based
+	 * on that
+	 */
+	data = malloc(REPLY_BUF_SIZE);
+	if(!data) {
+		close(socket_fd);
+		return NULL;
+	}
+	/* keep one byte for the terminator written below */
+	len = recv(socket_fd, data, REPLY_BUF_SIZE - 1, 0);
+	close(socket_fd);
+	if(len < 0) {
+		/* a failed recv() must not be used as an index */
+		free(data);
+		return NULL;
+	}
+	data[len] = '\0';
+	return data;
+}
+
+/* Value of a hexadecimal digit, or -1 if c is not one */
+static int hex_value(char c)
+{
+	if((c >= '0') && (c <= '9')) {
+		return c - '0';
+	}
+	if((c >= 'a') && (c <= 'f')) {
+		return c - 'a' + 10;
+	}
+	if((c >= 'A') && (c <= 'F')) {
+		return c - 'A' + 10;
+	}
+	return -1;
+}
+
+/* Return size bytes of zeroed memory, or exit the program if none is available */
+static void *alloc_or_exit(size_t size)
+{
+	void *mem = calloc(1, size);
+
+	if(!mem) {
+		perror("Out of memory");
+		exit(1);
+	}
+	return mem;
+}
+
+/*
+ * Fetch the next token into *token and tell whether it starts with
+ * keyword. *token is NULL when the data ended early.
+ */
+static int next_keyword_is(char **token, char **ptr, const char *keyword)
+{
+	*token = strtok_r(NULL, " ", ptr);
+	return (*token != NULL) && !strncmp(*token, keyword, strlen(keyword));
+}
+
+/*
+ * Fetch the next token into *token and convert it to a number in *value.
+ * Returns 0, with *token set to NULL, when the data ended early.
+ */
+static int next_number(char **token, char **ptr, long *value)
+{
+	*token = strtok_r(NULL, " ", ptr);
+	if(*token == NULL) {
+		return 0;
+	}
+	*value = strtol(*token, NULL, 10);
+	return 1;
+}
+
+/*
+ * Parse "<vector> LOAD <load> DIFF <diff> CLASS <class>", the fields that
+ * follow an IRQ keyword, into irq. Returns 0 on malformed data, leaving
+ * the offending token (or NULL) in *token.
+ */
+static int parse_irq_fields(irq_t *irq, char **token, char **ptr)
+{
+	long value;
+
+	if(!next_number(token, ptr, &value)) {
+		return 0;
+	}
+	irq->vector = value;
+	if(!next_keyword_is(token, ptr, "LOAD") || !next_number(token, ptr, &value)) {
+		return 0;
+	}
+	irq->load = value;
+	if(!next_keyword_is(token, ptr, "DIFF") || !next_number(token, ptr, &value)) {
+		return 0;
+	}
+	irq->diff = value;
+	if(!next_keyword_is(token, ptr, "CLASS") || !next_number(token, ptr, &value)) {
+		return 0;
+	}
+	irq->class = value;
+	return 1;
+}
+
+/*
+ * Parse the reply to a "setup" request into the global setup: the sleep
+ * interval, the banned irqs and the banned cpus. Exits the program when
+ * the data is malformed.
+ */
+void parse_setup(char *setup_data)
+{
+	char *token, *ptr;
+	char *copy;
+	irq_t *new_irq = NULL;
+	long value;
+	size_t i, nibble;
+
+	if((setup_data == NULL) || (strlen(setup_data) == 0)) return;
+	copy = strdup(setup_data);
+	if (!copy)
+		return;
+
+	/*
+	 * NOTE(review): lists from an earlier call are dropped here without
+	 * being freed - confirm nothing else refers to them before adding a
+	 * free
+	 */
+	setup.banned_irqs = NULL;
+	setup.banned_cpus = NULL;
+	token = strtok_r(copy, " ", &ptr);
+	if((token == NULL) || strncmp(token, "SLEEP", strlen("SLEEP"))) goto out;
+	if(!next_number(&token, &ptr, &value)) goto out;
+	setup.sleep = value;
+	token = strtok_r(NULL, " ", &ptr);
+	/* Parse banned IRQ data */
+	while((token != NULL) && !strncmp(token, "IRQ", strlen("IRQ"))) {
+		new_irq = alloc_or_exit(sizeof(irq_t));
+		if(!parse_irq_fields(new_irq, &token, &ptr)) goto out;
+		new_irq->is_banned = 1;
+		new_irq->assigned_to = NULL;
+		setup.banned_irqs = g_list_append(setup.banned_irqs, new_irq);
+		token = strtok_r(NULL, " ", &ptr);
+		new_irq = NULL;
+	}
+
+	if((token == NULL) || strncmp(token, "BANNED", strlen("BANNED"))) goto out;
+	token = strtok_r(NULL, " ", &ptr);
+	if(token == NULL) goto out;
+	/*
+	 * Walk the hex mask from its last digit, which holds cpus 0-3, so
+	 * that cpus are appended in ascending order. Characters that are not
+	 * hex digits (presumably ',' group separators or a trailing newline -
+	 * TODO confirm against the daemon) hold no cpus and must not advance
+	 * the numbering
+	 */
+	nibble = 0;
+	for(i = strlen(token); i > 0; i--) {
+		int bit;
+		int digit = hex_value(token[i - 1]);
+		if(digit < 0) {
+			continue;
+		}
+		for(bit = 0; bit < 4; bit++) {
+			if(digit & (1 << bit)) {
+				uint64_t *banned_cpu = alloc_or_exit(sizeof(uint64_t));
+				*banned_cpu = 4 * nibble + bit;
+				setup.banned_cpus = g_list_append(setup.banned_cpus,
+								banned_cpu);
+			}
+		}
+		nibble++;
+	}
+	free(copy);
+	return;
+
+out: {
+	/* Invalid data presented */
+	printf("Invalid data sent. Unexpected token: %s", token ? token : "(end of data)");
+	free(new_irq);
+	free(copy);
+	g_list_free(tree);
+	exit(1);
+}
+}
+
+/*
+ * Return a new list holding the cpu_list elements of every child of
+ * node, in order. The elements themselves are shared, not copied.
+ */
+GList * concat_child_lists(cpu_node_t *node)
+{
+	GList *new = NULL;
+	GList *child_entry;
+	GList *cpu_entry;
+
+	/* plain loops, so that an empty list is skipped instead of dereferenced */
+	for(child_entry = g_list_first(node->children); child_entry != NULL;
+	    child_entry = g_list_next(child_entry)) {
+		cpu_node_t *child = (cpu_node_t *)child_entry->data;
+		for(cpu_entry = g_list_first(child->cpu_list); cpu_entry != NULL;
+		    cpu_entry = g_list_next(cpu_entry)) {
+			new = g_list_append(new, cpu_entry->data);
+		}
+	}
+
+	return new;
+}
+
+/* for_each_irq() callback: give irq its own sorted copy of the cpu list passed as data */
+void copy_cpu_list_to_irq(irq_t *irq, void *data)
+{
+	irq->assigned_to = g_list_copy((GList *)data);
+	irq->assigned_to = g_list_sort(irq->assigned_to, sort_ints);
+}
+
+/*
+ * for_each_node() callback: build the cpu_list of node from its children,
+ * or from its own number for a node without children, and copy that list
+ * to every irq of the node
+ */
+void assign_cpu_lists(cpu_node_t *node, void *data __attribute__((unused)))
+{
+	if(g_list_length(node->children) > 0) {
+		for_each_node(node->children, assign_cpu_lists, NULL);
+		node->cpu_list = concat_child_lists(node);
+	} else {
+		node->cpu_list = g_list_append(node->cpu_list, &(node->number));
+	}
+
+	for_each_irq(node->irqs, copy_cpu_list_to_irq, node->cpu_list);
+}
+
+/*
+ * for_each_node() callback: store the cpus in the cpu_list of node as a
+ * hexadecimal mask string in cpu_mask, then do the same for its children.
+ * Cpus that do not fit in a 64 bit mask are left out.
+ */
+void assign_cpu_mask(cpu_node_t *node, void *data __attribute__((unused)))
+{
+	char *mask = alloc_or_exit(CPU_MASK_LEN);
+	unsigned long long sum = 0;
+	GList *list_entry;
+
+	for(list_entry = g_list_first(node->cpu_list); list_entry != NULL;
+	    list_entry = g_list_next(list_entry)) {
+		int *cpu = list_entry->data;
+		/* a shift count outside 0..63 would be undefined behaviour */
+		if((*cpu >= 0) && (*cpu < 64)) {
+			sum |= 1ULL << *cpu;
+		}
+	}
+	snprintf(mask, CPU_MASK_LEN, "0x%llx", sum);
+	node->cpu_mask = mask;
+
+	if(g_list_length(node->children) > 0) {
+		for_each_node(node->children, assign_cpu_mask, NULL);
+	}
+}
+
+/*
+ * Parse the reply to a "stats" request into the global tree, then fill
+ * in the cpu list and cpu mask of every node. The data is a sequence of
+ * "TYPE t NUMBER n LOAD l SAVE_MODE s" records, each followed by the IRQ
+ * records assigned to it. Exits the program when the data is
+ * malformed.
+ */
+void parse_into_tree(char *data)
+{
+	char *token, *ptr;
+	cpu_node_t *parent = NULL;
+	char *copy;
+	irq_t *new_irq = NULL;
+	cpu_node_t *new = NULL;
+	long value;
+
+	/*
+	 * NOTE(review): a tree built by an earlier call is dropped here
+	 * without being freed - confirm nothing else refers to it before
+	 * adding a free
+	 */
+	tree = NULL;
+
+	if (!data || strlen(data) == 0)
+		return;
+
+	copy = strdup(data);
+	if (!copy)
+		return;
+
+	token = strtok_r(copy, " ", &ptr);
+	while(token != NULL) {
+		/* Parse node data */
+		if(strncmp(token, "TYPE", strlen("TYPE"))) goto out;
+		new = alloc_or_exit(sizeof(cpu_node_t));
+		new->irqs = NULL;
+		new->children = NULL;
+		new->cpu_list = NULL;
+		new->cpu_mask = NULL;
+		if(!next_number(&token, &ptr, &value)) goto out;
+		new->type = value;
+		if(new->type == OBJ_TYPE_NODE) {
+			parent = NULL;
+		} else {
+			/*
+			 * Climb to the nearest ancestor of a higher type.
+			 * A single step is not enough when, for example, a
+			 * package follows the cpus of the previous package,
+			 * and parent may also be NULL at this point
+			 */
+			while((parent != NULL) && (new->type >= parent->type)) {
+				parent = parent->parent;
+			}
+		}
+		if(!next_keyword_is(&token, &ptr, "NUMBER") || !next_number(&token, &ptr, &value)) goto out;
+		new->number = value;
+		if(!next_keyword_is(&token, &ptr, "LOAD") || !next_number(&token, &ptr, &value)) goto out;
+		new->load = value;
+		if(!next_keyword_is(&token, &ptr, "SAVE_MODE") || !next_number(&token, &ptr, &value)) goto out;
+		new->is_powersave = value;
+		token = strtok_r(NULL, " ", &ptr);
+
+		/* Parse assigned IRQ data */
+		while((token != NULL) && (!strncmp(token, "IRQ", strlen("IRQ")))) {
+			new_irq = alloc_or_exit(sizeof(irq_t));
+			if(!parse_irq_fields(new_irq, &token, &ptr)) goto out;
+			new_irq->is_banned = 0;
+			new->irqs = g_list_append(new->irqs, new_irq);
+			token = strtok_r(NULL, " ", &ptr);
+			new_irq = NULL;
+		}
+
+		/* token is now NULL or the first token after the records of this node */
+		new->parent = parent;
+		if(parent == NULL) {
+			tree = g_list_append(tree, new);
+		} else {
+			parent->children = g_list_append(parent->children, new);
+		}
+		if(new->type != OBJ_TYPE_CPU) {
+			parent = new;
+		}
+
+		new = NULL;
+	}
+	free(copy);
+	for_each_node(tree, assign_cpu_lists, NULL);
+	for_each_node(tree, assign_cpu_mask, NULL);
+	return;
+
+out: {
+	/* Invalid data presented */
+	printf("Invalid data sent. Unexpected token: %s\n", token ? token : "(end of data)");
+	free(new_irq);
+	free(new);
+	/* copy is released last: token points into it */
+	free(copy);
+	g_list_free(tree);
+	exit(1);
+}
+}
+
+/*
+ * GSourceFunc: fetch fresh setup and statistics from irqbalance, parse
+ * them and redraw the tree view if it is the active one. Always returns
+ * TRUE.
+ */
+gboolean rescan_tree(gpointer data __attribute__((unused)))
+{
+	char *setup_data = get_data(SETUP);
+	parse_setup(setup_data);
+	char *irqbalance_data = get_data(STATS);
+	parse_into_tree(irqbalance_data);
+	if(is_tree) {
+		display_tree();
+	}
+	free(setup_data);
+	free(irqbalance_data);
+	return TRUE;
+}
+
+/* GSourceFunc: read one key and act on it ('q', F3, F4, F5). Always returns TRUE. */
+gboolean key_loop(gpointer data __attribute__((unused)))
+{
+	int c = getch();
+	switch(c) {
+	case 'q':
+		close_window(0);
+		break;
+	case KEY_F(3):
+		is_tree = 1;
+		display_tree();
+		break;
+	case KEY_F(4):
+		is_tree = 0;
+		settings();
+		break;
+	case KEY_F(5):
+		is_tree = 0;
+		setup_irqs();
+		break;
+	default:
+		break;
+	}
+	return TRUE;
+}
+
+/*
+ * Entry point. The pid of irqbalance is taken from the first argument or,
+ * without one, found by scanning /proc for a command line that contains
+ * "irqbalance" but not "irqbalance-ui".
+ */
+int main(int argc, char **argv)
+{
+	if(getuid() != 0) {
+		printf("This program needs to be executed with root privileges\n");
+		return EACCES;
+	}
+	if(argc > 1) {
+		/* PID of irqbalance specified */
+		irqbalance_pid = strtol(argv[1], NULL, 10);
+		if(!irqbalance_pid) {
+			printf("PID must be a number\n");
+			return EINVAL;
+		}
+	} else {
+		/* We need to find irqbalance's PID */
+		DIR *dir = opendir("/proc");
+		if(dir) {
+			struct dirent *entry;
+			char cmdfile[512];
+			char cmdstring[256];
+			while((irqbalance_pid == -1) && ((entry = readdir(dir)) != NULL)) {
+				FILE *f;
+				snprintf(cmdfile, sizeof(cmdfile), "/proc/%s/cmdline", entry->d_name);
+				f = fopen(cmdfile, "r");
+				if(f == NULL) {
+					continue;
+				}
+				/*
+				 * fgets() leaves the buffer untouched when
+				 * nothing can be read, so only look at it
+				 * after a successful read
+				 */
+				if((fgets(cmdstring, sizeof(cmdstring), f) != NULL) &&
+				   (strstr(cmdstring, "irqbalance") != NULL) &&
+				   (strstr(cmdstring, "irqbalance-ui") == NULL)) {
+					irqbalance_pid = strtol(entry->d_name, NULL, 10);
+				}
+				fclose(f);
+			}
+			closedir(dir);
+		}
+		if(irqbalance_pid == -1) {
+			printf("Unable to determine irqbalance PID\n");
+			return EINVAL;
+		}
+	}
+
+	init();
+
+	main_loop = g_main_loop_new(NULL, FALSE);
+	g_timeout_add_seconds(5, rescan_tree, NULL);
+	g_timeout_add_seconds(1, key_loop, NULL);
+	g_main_loop_run(main_loop);
+
+
+	g_main_loop_quit(main_loop);
+	close_window(0);
+	return 0;
+}
diff --git a/ui/irqbalance-ui.h b/ui/irqbalance-ui.h new file mode 100644 index 0000000..fba7e7c --- /dev/null +++ b/ui/irqbalance-ui.h
@@ -0,0 +1,87 @@ + +#ifndef IRQBALANCE_UI_H +#define IRQBALANCE_UI_H + +#include <stdio.h> +#include <stdint.h> +#include <glib.h> +#include <glib-unix.h> + +#define SOCKET_PATH "irqbalance" +#define SOCKET_TMPFS "/run/irqbalance" + +#define STATS "stats" +#define SET_SLEEP "settings sleep " +#define BAN_IRQS "settings ban irqs " +#define SETUP "setup" + +/* IRQ CLASSES (same as irqbalance uses) */ +#define IRQ_NODEF -1 +#define IRQ_OTHER 0 +#define IRQ_LEGACY 1 +#define IRQ_SCSI 2 +#define IRQ_VIDEO 3 +#define IRQ_ETH 4 +#define IRQ_GBETH 5 +#define IRQ_10GBETH 6 +#define IRQ_VIRT_EVENT 7 + + +/* Typedefs */ + +typedef enum node_type { + OBJ_TYPE_CPU, + OBJ_TYPE_CACHE, + OBJ_TYPE_PACKAGE, + OBJ_TYPE_NODE +} node_type_e; + +typedef struct irq { + int vector; + uint64_t load; + uint64_t diff; + char is_banned; + GList *assigned_to; + int class; +} irq_t; + +typedef struct cpu_node { + node_type_e type; + int number; + uint64_t load; + int is_powersave; + struct cpu_node *parent; + GList *children; + GList *irqs; + GList *cpu_list; + char *cpu_mask; +} cpu_node_t; + +typedef struct cpu_ban { + int number; + char is_banned; +} cpu_ban_t; + +typedef struct setup { + uint64_t sleep; + GList *banned_irqs; + GList *banned_cpus; +} setup_t; + +/* Function prototypes */ + +struct msghdr * create_credentials_msg(); +int init_connection(); +void send_settings(char *data); +char * get_data(char *string); +void parse_setup(char *setup_data); +GList * concat_child_lists(cpu_node_t *node); +void copy_cpu_list_to_irq(irq_t *irq, void *data); +void assign_cpu_lists(cpu_node_t *node, void *data); +void assign_cpu_mask(cpu_node_t *node, void *data); +void parse_into_tree(char *data); +gboolean rescan_tree(gpointer data); +int main(); + + +#endif /* IRQBALANCE_UI_H */
diff --git a/ui/ui.c b/ui/ui.c new file mode 100644 index 0000000..06ec472 --- /dev/null +++ b/ui/ui.c
@@ -0,0 +1,736 @@ + +#include <string.h> +#include "ui.h" + + +GList *all_cpus = NULL; +GList *all_irqs = NULL; + +char *IRQ_CLASS_TO_STR[] = { + "Other", + "Legacy", + "SCSI", + "Video", + "Ethernet", + "Gigabit Ethernet", + "10-Gigabit Ethernet," + "Virt Event"}; + +void show_frame() +{ + int i; + attrset(COLOR_PAIR(4)); + char top[COLS]; + top[0] = '\0'; + while(strlen(top) != (size_t)COLS - 1) { + snprintf(top + strlen(top), COLS - strlen(top), " "); + } + mvprintw(0, 0, top); + for(i = 0; i < LINES; i++) { + mvprintw(i, 0, " "); + mvprintw(i, COLS - 1, " "); + } +} + +void show_footer() +{ + char footer[COLS]; + snprintf(footer, COLS - 1, + " q (QUIT) F3 (TREE) F4 (SETTINGS) F5 (SETUP IRQS)"); + while(strlen(footer) != (size_t)COLS - 1) { + snprintf(footer + strlen(footer), COLS - strlen(footer), " "); + } + attrset(COLOR_PAIR(4)); + mvprintw(LINES - 1, 0, footer); +} + +char * check_control_in_sleep_input(int max_len, int column_offest, int line_offset) +{ + char *input_to = malloc(max_len * sizeof(char)); + int iteration = 0; + while(iteration < max_len) { + int new = getch(); + switch(new) { + case ERR: + /* No input is ready for nonblocking getch() call */ + break; + case '\r': + case '\n': + input_to[iteration] = '\0'; + return input_to; + case 'q': + close_window(0); + break; + case KEY_BACKSPACE: + if(iteration > 0) { + attrset(COLOR_PAIR(5)); + iteration--; + mvaddch(line_offset, column_offest + iteration, ' '); + } + move(line_offset, column_offest + iteration); + attrset(COLOR_PAIR(6)); + break; + case 27: + free(input_to); + return NULL; + default: + input_to[iteration] = new; + iteration++; + break; + } + } + return input_to; +} + +int get_valid_sleep_input(int column_offest) +{ + uint64_t new_sleep = setup.sleep; + while(1) { + attrset(COLOR_PAIR(5)); + mvprintw(2, column_offest, " "); + attrset(COLOR_PAIR(6)); + refresh(); + move(2, column_offest); + curs_set(1); + char *input = check_control_in_sleep_input(20, column_offest, 3); + if(input == 
NULL) { + curs_set(0); + attrset(COLOR_PAIR(1)); + mvprintw(2, column_offest, "%lu ", new_sleep); + move(LINES, COLS); + break; + } + attrset(COLOR_PAIR(1)); + mvprintw(LINES - 2, 1, " "); + curs_set(0); + refresh(); + char *error; + new_sleep = strtol(input, &error, 10); + if((*error == '\0') && (new_sleep >= 1)) { + break; + } else { + new_sleep = setup.sleep; + attrset(COLOR_PAIR(4)); + mvprintw(LINES - 2, 1, + "Invalid input: %s ", + input); + refresh(); + } + free(input); + } + + attrset(COLOR_PAIR(1)); + mvprintw(2, column_offest, "%lu ", new_sleep); + + return new_sleep; +} + +void get_banned_cpu(int *cpu, void *data __attribute__((unused))) +{ + cpu_ban_t *new = malloc(sizeof(cpu_ban_t)); + new->number = *cpu; + new->is_banned = 1; + all_cpus = g_list_append(all_cpus, new); +} + +void print_cpu_line(cpu_ban_t *cpu, void *data) +{ + int *line_offset = data; + if(cpu->is_banned) { + attrset(COLOR_PAIR(10)); + } else { + attrset(COLOR_PAIR(9)); + } + mvprintw(*line_offset, 3, "CPU %d", cpu->number); + mvprintw(*line_offset, 19, "%s", cpu->is_banned ? 
+ "YES " : + "NO "); + (*line_offset)++; +} + +void print_all_cpus() +{ + if(all_cpus == NULL) { + for_each_node(tree, get_cpu, NULL); + for_each_int(setup.banned_cpus, get_banned_cpu, NULL); + all_cpus = g_list_sort(all_cpus, sort_all_cpus); + } + int *line = malloc(sizeof(int)); + *line = 6; + attrset(COLOR_PAIR(2)); + mvprintw(4, 3, "NUMBER IS BANNED"); + for_each_cpu(all_cpus, print_cpu_line, line); +} + +void add_banned_cpu(int *banned_cpu, void *data) +{ + snprintf(data + strlen(data), 1024 - strlen(data), "%d, ", *banned_cpu); +} + +void display_banned_cpus() +{ + char banned_cpus[1024] = "Banned CPU numbers: \0"; + if(g_list_length(setup.banned_cpus) > 0) { + for_each_int(setup.banned_cpus, add_banned_cpu, banned_cpus); + snprintf(banned_cpus + strlen(banned_cpus) - 2, + 1024 - strlen(banned_cpus), "\n"); + } else { + snprintf(banned_cpus + strlen(banned_cpus), + 1024 - strlen(banned_cpus), "None\n"); + } + attrset(COLOR_PAIR(0)); + mvprintw(2, 5, "%s\n", banned_cpus); +} + +int toggle_cpu(GList *cpu_list, int cpu_number) +{ + GList *entry = g_list_first(cpu_list); + cpu_ban_t *entry_data = (cpu_ban_t *)(entry->data); + while(entry_data->number != cpu_number) { + entry = g_list_next(entry); + entry_data = (cpu_ban_t *)(entry->data); + } + if(((cpu_ban_t *)(entry->data))->is_banned) { + ((cpu_ban_t *)(entry->data))->is_banned = 0; + } else { + ((cpu_ban_t *)(entry->data))->is_banned = 1; + } + return ((cpu_ban_t *)(entry->data))->is_banned; +} + +void get_new_cpu_ban_values(cpu_ban_t *cpu, void *data) +{ + char *mask_data = (char *)data; + if(cpu->is_banned) { + snprintf(mask_data + strlen(mask_data), 1024 - strlen(mask_data), + "%d,", cpu->number); + } +} + +void get_cpu(cpu_node_t *node, void *data __attribute__((unused))) +{ + if(node->type == OBJ_TYPE_CPU) { + cpu_ban_t *new = malloc(sizeof(cpu_ban_t)); + new->number = node->number; + new->is_banned = 0; + all_cpus = g_list_append(all_cpus, new); + } + if(g_list_length(node->children) > 0) { + 
for_each_node(node->children, get_cpu, NULL); + } +} + +void handle_cpu_banning() +{ + GList *tmp = g_list_copy_deep(all_cpus, copy_cpu_ban, NULL); + attrset(COLOR_PAIR(5)); + mvprintw(LINES - 3, 1, "Move up and down the list, toggle ban with Enter."); + mvprintw(LINES - 2, 1, + "Press ESC for discarding and <S> for saving the values."); + move(6, 19); + curs_set(1); + refresh(); + size_t position = 5; + char processing = 1; + while(processing) { + int direction = getch(); + switch(direction) { + case KEY_UP: + if(position > 6) { + position--; + move(position, 19); + } + break; + case KEY_DOWN: + if(position <= g_list_length(all_cpus) + 4) { + position++; + move(position, 19); + } + break; + case '\n': + case '\r': { + attrset(COLOR_PAIR(3)); + int banned = toggle_cpu(tmp, position - 6); + if(banned) { + mvprintw(position, 19, "YES"); + } else { + mvprintw(position, 19, "NO "); + } + move(position, 19); + refresh(); + break; + } + case 27: + processing = 0; + curs_set(0); + /* Forget the changes */ + tmp = g_list_copy_deep(all_cpus, copy_cpu_ban, NULL); + print_all_cpus(); + attrset(COLOR_PAIR(0)); + mvprintw(LINES - 3, 1, " \ + "); + attrset(COLOR_PAIR(5)); + mvprintw(LINES - 2, 1, + "Press <S> for changing sleep setup, <C> for CPU ban setup. "); + move(LINES - 1, COLS - 1); + refresh(); + break; + case 's': + processing = 0; + all_cpus = tmp; + curs_set(0); + print_all_cpus(); + attrset(COLOR_PAIR(0)); + mvprintw(LINES - 3, 1, " \ + "); + attrset(COLOR_PAIR(5)); + mvprintw(LINES - 2, 1, + "Press <S> for changing sleep setup, <C> for CPU ban setup. 
"); + attrset(COLOR_PAIR(3)); + move(LINES - 1, COLS - 1); + refresh(); + char settings_string[1024] = "settings cpus \0"; + for_each_cpu(all_cpus, get_new_cpu_ban_values, settings_string); + if(!strcmp("settings cpus \0", settings_string)) { + strncpy(settings_string + strlen(settings_string), + "NULL", 1024 - strlen(settings_string)); + } + send_settings(settings_string); + break; + case 'q': + processing = 0; + close_window(0); + break; + case KEY_F(3): + is_tree = 1; + processing = 0; + display_tree(); + break; + case KEY_F(5): + is_tree = 0; + processing = 0; + setup_irqs(); + break; + default: + break; + } + } +} + +void copy_assigned_obj(int *number, void *data) +{ + snprintf(data + strlen(data), 128 - strlen(data), "%d, ", *number); +} + +void print_assigned_objects_string(irq_t *irq, int *line_offset) +{ + if(irq->is_banned) { + return; + } + char assigned_to[128] = "\0"; + for_each_int(irq->assigned_to, copy_assigned_obj, assigned_to); + assigned_to[strlen(assigned_to) - 2] = '\0'; + mvprintw(*line_offset, 36, assigned_to); +} + +void print_irq_line(irq_t *irq, void *data) +{ + int *line_offset = data; + switch(irq->class) { + case(IRQ_OTHER): + attrset(COLOR_PAIR(1)); + break; + case(IRQ_LEGACY): + attrset(COLOR_PAIR(2)); + break; + case(IRQ_SCSI): + attrset(COLOR_PAIR(3)); + break; + case(IRQ_VIDEO): + attrset(COLOR_PAIR(8)); + break; + case(IRQ_ETH): + case(IRQ_GBETH): + case(IRQ_10GBETH): + attrset(COLOR_PAIR(9)); + break; + case(IRQ_VIRT_EVENT): + attrset(COLOR_PAIR(10)); + break; + default: + attrset(COLOR_PAIR(0)); + break; + } + mvprintw(*line_offset, 3, "IRQ %d", irq->vector); + mvprintw(*line_offset, 19, "%s", irq->is_banned ? "YES" : "NO "); + print_assigned_objects_string(irq, line_offset); + mvprintw(*line_offset, 84, "%s", + irq->class < 0 ? 
"Unknown" : IRQ_CLASS_TO_STR[irq->class]); + (*line_offset)++; + +} + +void print_all_irqs() +{ + int *line = malloc(sizeof(int)); + *line = 4; + attrset(COLOR_PAIR(0)); + mvprintw(2, 3, + "NUMBER IS BANNED ASSIGNED TO CPUS \ + CLASS"); + for_each_irq(all_irqs, print_irq_line, line); +} + +int toggle_irq(GList *irq_list, int position) +{ + GList *entry = g_list_first(irq_list); + int irq_node = 0; + while(irq_node != position) { + entry = g_list_next(entry); + irq_node++; + } + if(((irq_t *)(entry->data))->is_banned) { + ((irq_t *)(entry->data))->is_banned = 0; + } else { + ((irq_t *)(entry->data))->is_banned = 1; + } + return ((irq_t *)(entry->data))->is_banned; +} + +void get_new_irq_ban_values(irq_t *irq, void *data) +{ + char *ban_list = (char *)data; + if(irq->is_banned) { + snprintf(ban_list + strlen(ban_list), 1024 - strlen(ban_list), + " %d", irq->vector); + } +} + +void copy_irqs_from_nodes(cpu_node_t *node, void *data __attribute__((unused))) +{ + if(g_list_length(node->irqs) > 0) { + GList *new = g_list_copy_deep(node->irqs, copy_irq, NULL); + all_irqs = g_list_concat(all_irqs, new); + } + if(g_list_length(node->children) > 0) { + for_each_node(node->children, copy_irqs_from_nodes, all_irqs); + } +} + +void get_all_irqs() +{ + all_irqs = g_list_copy_deep(setup.banned_irqs, copy_irq, NULL); + for_each_node(tree, copy_irqs_from_nodes, NULL); +} + +void handle_irq_banning() +{ + GList *tmp = g_list_copy_deep(all_irqs, copy_irq, NULL); + attrset(COLOR_PAIR(5)); + mvprintw(LINES - 3, 1, "Move up and down the list, toggle ban with Enter."); + mvprintw(LINES - 2, 1, "Press ESC for discarding and <S> for saving the values."); + move(4, 19); + curs_set(1); + refresh(); + size_t position = 3; + char processing = 1; + while(processing) { + int direction = getch(); + switch(direction) { + case KEY_UP: + if(position > 4) { + position--; + move(position, 19); + } + break; + case KEY_DOWN: + if(position < g_list_length(all_irqs) + 3) { + position++; + move(position, 
19); + } + break; + case '\n': + case '\r': { + attrset(COLOR_PAIR(3)); + int banned = toggle_irq(tmp, position - 4); + if(banned) { + mvprintw(position, 19, "YES"); + } else { + mvprintw(position, 19, "NO "); + } + move(position, 19); + refresh(); + break; + } + case 27: + processing = 0; + curs_set(0); + /* Forget the changes */ + tmp = g_list_copy_deep(all_irqs, copy_irq, NULL); + print_all_irqs(); + attrset(COLOR_PAIR(0)); + mvprintw(LINES - 3, 1, " \ + "); + attrset(COLOR_PAIR(5)); + mvprintw(LINES - 2, 1, "Press <I> for setting up IRQ banning.\ + "); + move(LINES - 1, COLS - 1); + refresh(); + break; + case 's': + processing = 0; + all_irqs = tmp; + curs_set(0); + print_all_irqs(); + attrset(COLOR_PAIR(0)); + mvprintw(LINES - 3, 1, " \ + "); + attrset(COLOR_PAIR(5)); + mvprintw(LINES - 2, 1, "Press <I> for setting up IRQ banning.\ + "); + attrset(COLOR_PAIR(3)); + move(LINES - 1, COLS - 1); + refresh(); + char settings_string[1024] = BAN_IRQS; + for_each_irq(all_irqs, get_new_irq_ban_values, settings_string); + if(!strcmp(BAN_IRQS, settings_string)) { + strncpy(settings_string + strlen(settings_string), + " NONE", 1024 - strlen(settings_string)); + } + send_settings(settings_string); + break; + case 'q': + processing = 0; + close_window(0); + break; + case KEY_F(3): + is_tree = 1; + processing = 0; + display_tree(); + break; + case KEY_F(4): + is_tree = 0; + processing = 0; + settings(); + break; + default: + break; + } + } +} + +void init() +{ + signal(SIGINT, close_window); + initscr(); + keypad(stdscr, TRUE); + curs_set(0); + nonl(); + cbreak(); + nodelay(stdscr, TRUE); + echo(); + if(has_colors()) { + start_color(); + init_pair(1, COLOR_RED, COLOR_BLACK); + init_pair(2, COLOR_YELLOW, COLOR_BLACK); + init_pair(3, COLOR_GREEN, COLOR_BLACK); + init_pair(4, COLOR_WHITE, COLOR_BLUE); + init_pair(5, COLOR_WHITE, COLOR_RED); + init_pair(6, COLOR_RED, COLOR_WHITE); + init_pair(7, COLOR_BLACK, COLOR_CYAN); + init_pair(8, COLOR_BLUE, COLOR_BLACK); + init_pair(9, 
COLOR_CYAN, COLOR_BLACK); + init_pair(10, COLOR_MAGENTA, COLOR_BLACK); + } + + display_tree(); +} + +void close_window(int sig __attribute__((unused))) +{ + g_list_free(setup.banned_irqs); + g_list_free(setup.banned_cpus); + g_list_free_full(tree, free); + endwin(); + exit(EXIT_SUCCESS); +} + +void settings() +{ + clear(); + char *setup_data = get_data(SETUP); + parse_setup(setup_data); + + char info[128] = "Current sleep interval between rebalancing: \0"; + uint8_t sleep_input_offset = strlen(info) + 3; + snprintf(info + strlen(info), 128 - strlen(info), "%lu\n", setup.sleep); + attrset(COLOR_PAIR(1)); + mvprintw(2, 3, info); + print_all_cpus(); + + int user_input = 1; + while(user_input) { + attrset(COLOR_PAIR(5)); + mvprintw(LINES - 2, 1, + "Press <S> for changing sleep setup, <C> for CPU ban setup. "); + show_frame(); + show_footer(); + refresh(); + int c = getch(); + switch(c) { + case 's': { + mvprintw(LINES - 1, 1, "Press ESC for discarding your input.\ + "); + attrset(COLOR_PAIR(0)); + mvprintw(LINES - 2, 1, " \ + "); + uint64_t new_sleep = get_valid_sleep_input(sleep_input_offset); + if(new_sleep != setup.sleep) { + setup.sleep = new_sleep; + char settings_data[128]; + snprintf(settings_data, 128, "%s %lu", SET_SLEEP, new_sleep); + send_settings(settings_data); + } + break; + } + case 'c': + handle_cpu_banning(); + break; + /* We need to include window changing options as well because the + * related char was eaten up by getch() already */ + case 'q': + user_input = 0; + close_window(0); + break; + case KEY_F(3): + is_tree = 1; + user_input = 0; + display_tree(); + break; + case KEY_F(5): + is_tree = 0; + user_input = 0; + setup_irqs(); + break; + default: + break; + } + } + free(setup_data); +} + +void setup_irqs() +{ + clear(); + get_all_irqs(); + all_irqs = g_list_sort(all_irqs, sort_all_irqs); + print_all_irqs(); + attrset(COLOR_PAIR(5)); + mvprintw(LINES - 2, 1, "Press <I> for setting up IRQ banning."); + show_frame(); + show_footer(); + refresh(); + 
+ int user_input = 1; + while(user_input) { + int c = getch(); + switch(c) { + case 'i': + handle_irq_banning(); + break; + case 'q': + user_input = 0; + close_window(0); + break; + case KEY_F(3): + is_tree = 1; + user_input = 0; + display_tree(); + break; + case KEY_F(4): + is_tree = 0; + user_input = 0; + settings(); + break; + default: + break; + } + } +} + +void display_tree_node_irqs(irq_t *irq, void *data) +{ + char indent[32] = " \0"; + snprintf(indent + strlen(indent), 32 - strlen(indent), "%s", (char *)data); + attrset(COLOR_PAIR(3)); + printw("%sIRQ %lu, IRQs since last rebalance %lu\n", + indent, irq->vector, irq->diff); +} + +void display_tree_node(cpu_node_t *node, void *data) +{ + int i; + const char *node_type_to_str[] = { + "CPU\0", + "CACHE DOMAIN\0", + "CPU PACKAGE\0", + "NUMA NODE\0"}; + + char *spaces = " \0"; + char indent[32] = "\0"; + char *asciitree = " `--\0"; + for(i = node->type; i <= OBJ_TYPE_NODE; i++) { + snprintf(indent + strlen(indent), 32 - strlen(indent), "%s", spaces); + if(i != OBJ_TYPE_NODE) { + snprintf(indent + strlen(indent), 32 - strlen(indent), " "); + } + } + snprintf(indent + strlen(indent), 32 - strlen(indent), "%s", asciitree); + char copy_to[1024]; + char *numa_available = "\0"; + if((node->type == OBJ_TYPE_NODE) && (node->number == -1)) { + numa_available = " (This machine is not NUMA-capable)"; + } + snprintf(copy_to, 1024, "%s%s, number %d%s, CPU mask %s\n", + indent, node_type_to_str[node->type], node->number, numa_available, + node->cpu_mask); + switch(node->type) { + case(OBJ_TYPE_CPU): + attrset(COLOR_PAIR(1)); + break; + case(OBJ_TYPE_CACHE): + attrset(COLOR_PAIR(2)); + break; + case(OBJ_TYPE_PACKAGE): + attrset(COLOR_PAIR(8)); + break; + case(OBJ_TYPE_NODE): + attrset(COLOR_PAIR(9)); + break; + default: + break; + } + printw(copy_to); + if(g_list_length(node->irqs) > 0) { + for_each_irq(node->irqs, display_tree_node_irqs, indent); + } + if(g_list_length(node->children)) { + for_each_node(node->children, 
display_tree_node, data); + } +} + +void display_tree() +{ + clear(); + char *setup_data = get_data(SETUP); + parse_setup(setup_data); + char *irqbalance_data = get_data(STATS); + parse_into_tree(irqbalance_data); + display_banned_cpus(); + for_each_node(tree, display_tree_node, NULL); + show_frame(); + show_footer(); + refresh(); + free(setup_data); + free(irqbalance_data); +}
diff --git a/ui/ui.h b/ui/ui.h new file mode 100644 index 0000000..0aa8280 --- /dev/null +++ b/ui/ui.h
@@ -0,0 +1,53 @@ + +#ifndef UI_H +#define UI_H + +#include <glib.h> +#include <glib-unix.h> +#include <curses.h> +#include <form.h> +#include <ncurses.h> +#include <signal.h> +#include "irqbalance-ui.h" +#include "helpers.h" + +extern GList *tree; +extern setup_t setup; +extern int is_tree; + +void show_frame(); +void show_footer(); + +char * check_control_in_sleep_input(int max_len, int column_offest, int line_offset); +int get_valid_sleep_input(int column_offest); + +void get_banned_cpu(int *cpu, void *data); +void print_cpu_line(cpu_ban_t *cpu, void *data); +void print_all_cpus(); +void add_banned_cpu(int *banned_cpu, void *data); +void display_banned_cpus(); +int toggle_cpu(GList *cpu_list, int cpu_number); +void get_new_cpu_ban_values(cpu_ban_t *cpu, void *data); +void get_cpu(); +void handle_cpu_banning(); + +void copy_assigned_obj(int *number, void *data); +void print_assigned_objects_string(irq_t *irq, int *line_offset); +void print_irq_line(irq_t *irq, void *data); +void print_all_irqs(); +int toggle_irq(GList *irq_list, int position); +void get_new_irq_ban_values(irq_t *irq, void *data); +void copy_irqs_from_nodes(cpu_node_t *node, void *data); +void get_all_irqs(); +void handle_irq_banning(); + +void init(); +void close_window(int sig); +void settings(); +void setup_irqs(); +void display_tree_node_irqs(irq_t *irq, void *data); +void display_tree_node(cpu_node_t *node, void *data); +void display_tree(); + + +#endif /* UI_H */