/**
* @file ocount_counter.cpp
* Functions and classes for ocount tool.
*
* @remark Copyright 2013 OProfile authors
* @remark Read the file COPYING
*
* Created on: May 22, 2013
* @author Maynard Johnson
* (C) Copyright IBM Corp. 2013
*
*/
#include <time.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <dirent.h>
#include <stdlib.h>
#include <sys/types.h>
#include <signal.h>
#include <unistd.h>
#include <limits.h>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include "ocount_counter.h"
#include "op_pe_utils.h"
#include "operf_event.h"
#include "cverb.h"
extern verbose vdebug;
extern bool use_cpu_minus_one;
extern char * app_name;
using namespace std;
ocount_counter::ocount_counter(operf_event_t & evt, bool enable_on_exec,
bool inherit)
{
memset(&attr, 0, sizeof(attr));
attr.size = sizeof(attr);
attr.type = PERF_TYPE_RAW;
attr.config = evt.evt_code;
attr.inherit = inherit ? 1 : 0;
attr.enable_on_exec = enable_on_exec ? 1 : 0;
attr.disabled = attr.enable_on_exec;
attr.exclude_idle = 0;
attr.exclude_kernel = evt.no_kernel;
attr.exclude_user = evt.no_user;
attr.exclude_hv = evt.no_hv;
// This read format lets us report to the user the percentage of time an event
// was actually scheduled on the PMU when the kernel has had to multiplex
// events (see the sketch after this constructor).
attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
PERF_FORMAT_TOTAL_TIME_RUNNING;
event_name = evt.name;
fd = cpu = pid = -1;
}
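/* Illustrative sketch, excluded from the build: with the read_format chosen
 * above, a single read() on the counter fd returns three u64 values in the
 * order shown below, which is exactly what read_count_data() copies into an
 * ocount_accum_t. When the kernel has had to multiplex counters, running_time
 * is less than enabled_time and the raw count can be scaled up to an estimate
 * for the full enabled period.
 */
#if 0
struct example_count_read_layout {
	u64 count;		// raw number of events counted
	u64 enabled_time;	// ns the event was enabled
	u64 running_time;	// ns the event was actually on the PMU
};

static u64 example_scaled_count(ocount_accum_t const & a)
{
	// Same scaling output_short_results()/output_long_results() apply:
	// count * enabled_time / running_time.
	if (!a.running_time)
		return 0;
	return (u64)(a.count * ((double)a.enabled_time / a.running_time));
}
#endif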
ocount_counter::~ocount_counter() {
}
int ocount_counter::perf_event_open(pid_t _pid, int _cpu)
{
fd = op_perf_event_open(&attr, _pid, _cpu, -1, 0);
if (fd < 0) {
int ret = -1;
cverb << vdebug << "perf_event_open failed: " << strerror(errno) << endl;
if (errno == EBUSY) {
cerr << "The performance monitoring hardware reports EBUSY. Is another profiling tool in use?" << endl
<< "On some architectures, tools such as oprofile and perf being used in system-wide "
<< "mode can cause this problem." << endl;
ret = OP_PERF_HANDLED_ERROR;
} else if (errno == ESRCH) {
cerr << "!!!! No samples collected !!!" << endl;
cerr << "The target program/command ended before profiling was started." << endl;
ret = OP_PERF_HANDLED_ERROR;
} else {
cerr << "perf_event_open failed with " << strerror(errno) << endl;
}
return ret;
}
pid = _pid;
cpu = _cpu;
cverb << vdebug << "perf_event_open returning fd " << fd << endl;
return fd;
}
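/* op_perf_event_open() comes from op_pe_utils; assuming it is the usual thin
 * wrapper, the underlying call is the raw perf_event_open syscall (there is
 * no glibc wrapper for it). A minimal sketch, excluded from the build:
 */
#if 0
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long example_perf_event_open(struct perf_event_attr * attr, pid_t pid,
				    int cpu, int group_fd, unsigned long flags)
{
	// pid > 0,  cpu == -1 : count the given task on whatever CPU it runs
	// pid == -1, cpu >= 0 : count everything running on one CPU
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}
#endif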
int ocount_counter::read_count_data(ocount_accum_t * count_data)
{
size_t len = 3 * sizeof(u64);
char * buf = (char *)count_data;
while (len) {
int ret = read(fd, buf, len);
if (ret < 0)
return ret;
// read() returning 0 here would be a short read; don't report it as success
if (ret == 0)
return -1;
len -= ret;
buf += ret;
}
return 0;
}
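/* Usage sketch, excluded from the build: how one counter is driven from open
 * to read. 'evt' is assumed to be an operf_event_t already resolved by the
 * ocount front end, and 'app_pid' a child that ocount has forked and that
 * will exec the monitored program, which is why enable_on_exec is set (as in
 * ocount_record::setup() below).
 */
#if 0
static void example_count_one_event(operf_event_t & evt, pid_t app_pid)
{
	ocount_counter ctr(evt, true /* enable_on_exec */, true /* inherit */);
	if (ctr.perf_event_open(app_pid, -1) < 0)
		return;				// error already reported
	// ... let the monitored program run to completion ...
	ocount_accum_t accum;
	if (ctr.read_count_data(&accum) < 0)
		return;
	double fraction = accum.enabled_time ?
		(double)accum.running_time / accum.enabled_time : 0.0;
	u64 scaled = fraction ? (u64)(accum.count / fraction) : 0;
	(void)scaled;				// report as desired
}
#endif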
ocount_record::ocount_record(enum op_runmode _runmode, std::vector<operf_event_t> & _evts,
bool _with_time_interval)
{
runmode = _runmode;
with_time_interval = _with_time_interval;
evts = _evts;
valid = false;
system_wide = false;
tasks_are_threads = false;
num_cpus = 0;
app_pid = -1;
}
bool ocount_record::start_counting_app_process(pid_t _pid)
{
if (valid) {
cerr << "ocount internal error: ocount_record already initialized" << endl;
return false;
}
if (runmode != OP_START_APP) {
cerr << "ocount internal error: Current run mode " << runmode << " is incompatible with "
"starting app." << endl;
return false;
}
app_pid = _pid;
setup();
return true;
}
/*
* There are separate ocount options for counting events for a set of processes ("--process-list")
* or a set of threads ("--thread-list"). This function is used for passing the set of either
* processes or threads to ocount_record, along with a boolean argument to indicate whether or not
* the set of passed tasks are threads. If they are threads, we set up perf_event_open to NOT
* do "inherit".
*/
bool ocount_record::start_counting_tasklist(std::vector<pid_t> _tasks, bool _are_threads)
{
if (valid) {
cerr << "ocount internal error: ocount_record already initialized" << endl;
return false;
}
tasks_are_threads = _are_threads;
specified_tasks = _tasks;
if (tasks_are_threads) {
if (runmode != OP_THREADLIST) {
cerr << "ocount internal error: Current run mode " << runmode << " is incompatible with "
"--thread-list option." << endl;
return false;
}
} else {
if (runmode != OP_PROCLIST) {
cerr << "ocount internal error: Current run mode " << runmode << " is incompatible with "
"--process-list option." << endl;
return false;
}
}
setup();
if (tasks_to_count.empty()) {
cerr << "No valid tasks to monitor -- quitting." << endl;
return false;
}
return true;
}
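/* Sketch of the inherit choice described above, excluded from the build: with
 * --process-list, inherit is set so newly created child tasks are also
 * counted; with --thread-list each tid is counted strictly on its own.
 */
#if 0
static ocount_counter example_counter_for_task(operf_event_t & evt,
					       bool tasks_are_threads)
{
	// do_counting_per_task() passes are_tasks_processes() as the inherit
	// flag, which is effectively the negation of tasks_are_threads.
	return ocount_counter(evt, false /* enable_on_exec */,
			      !tasks_are_threads /* inherit */);
}
#endif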
bool ocount_record::start_counting_cpulist(std::vector<int> _cpus)
{
if (valid) {
cerr << "ocount internal error: ocount_record already initialized" << endl;
return false;
}
if (runmode != OP_CPULIST) {
cerr << "ocount internal error: Current run mode " << runmode << " is incompatible with "
"--cpu-list option." << endl;
return false;
}
specified_cpus = _cpus;
setup();
return true;
}
bool ocount_record::start_counting_syswide(void)
{
if (valid) {
cerr << "ocount internal error: ocount_record already initialized" << endl;
return false;
}
if (runmode != OP_SYSWIDE) {
cerr << "ocount internal error: Current run mode " << runmode << " is incompatible with "
"--system-wide option." << endl;
return false;
}
system_wide = true;
setup();
return true;
}
int ocount_record::do_counting_per_task(void)
{
string err_msg;
int rc = 0;
for (set<pid_t>::iterator it = tasks_to_count.begin(); it != tasks_to_count.end(); it++) {
pid_t the_pid = *it;
bool inherit = are_tasks_processes();
cverb << vdebug << "calling perf_event_open for task " << the_pid << endl;
for (unsigned event = 0; event < evts.size(); event++) {
ocount_accum_t count_data = {0ULL, 0ULL, 0ULL};
accum_counts.push_back(count_data);
prev_accum_counts.push_back(0ULL);
ocount_counter op_ctr(evts[event], false, inherit);
if ((rc = op_ctr.perf_event_open(the_pid, -1)) < 0) {
err_msg = "Internal Error. Perf event setup failed.";
goto out;
} else {
rc = 0;
}
perfCounters.push_back(op_ctr);
}
}
out:
if (rc && rc != OP_PERF_HANDLED_ERROR)
throw runtime_error(err_msg);
return rc;
}
int ocount_record::do_counting_per_cpu(void)
{
string err_msg;
int rc = 0;
/* We'll do this sanity check here, but we also do it at the front-end where user
* args are being validated. If we wait until we get here, the invalid CPU argument
* becomes an ugly thrown exception.
*/
set<int> available_cpus = op_pe_utils::op_get_available_cpus(num_cpus);
if (runmode == OP_CPULIST) {
size_t k;
for (k = 0; k < specified_cpus.size(); k++) {
if (available_cpus.find(specified_cpus[k]) == available_cpus.end()) {
ostringstream err_msg_ostr;
err_msg_ostr << "Specified CPU " << specified_cpus[k] << " is not valid";
err_msg = err_msg_ostr.str();
rc = -1;
goto out;
} else {
cpus_to_count.insert(specified_cpus[k]);
}
}
} else {
cpus_to_count = available_cpus;
}
for (set<int>::iterator it = cpus_to_count.begin(); it != cpus_to_count.end(); it++) {
int the_cpu = *it;
cverb << vdebug << "calling perf_event_open for cpu " << the_cpu << endl;
for (unsigned event = 0; event < evts.size(); event++) {
ocount_accum_t count_data = {0ULL, 0ULL, 0ULL};
accum_counts.push_back(count_data);
prev_accum_counts.push_back(0ULL);
ocount_counter op_ctr(evts[event], false, true);
if ((rc = op_ctr.perf_event_open(-1, the_cpu)) < 0) {
err_msg = "Internal Error. Perf event setup failed.";
goto out;
} else {
rc = 0;
}
perfCounters.push_back(op_ctr);
}
}
out:
if (rc && rc != OP_PERF_HANDLED_ERROR)
throw runtime_error(err_msg);
return rc;
}
void ocount_record::setup()
{
int rc = 0;
string err_msg;
if (!specified_tasks.empty()) {
if ((rc = get_process_info(specified_tasks)) < 0) {
if (rc == OP_PERF_HANDLED_ERROR)
return;
else
throw runtime_error("Unexpected error in ocount_record setup");
}
}
/* To count events for an existing thread group, we need to call perf_event_open
 * for each thread, and we need to pass cpu=-1 on the syscall.
 */
if (!use_cpu_minus_one)
use_cpu_minus_one = !(system_wide || (runmode == OP_CPULIST));
num_cpus = use_cpu_minus_one ? 1 : sysconf(_SC_NPROCESSORS_ONLN);
if (num_cpus < 1) {
char int_str[256];
sprintf(int_str, "Number of online CPUs is %d; cannot continue", num_cpus);
throw runtime_error(int_str);
}
if (system_wide || (runmode == OP_CPULIST)) {
rc = do_counting_per_cpu();
} else if (!specified_tasks.empty()) {
rc = do_counting_per_task();
} else {
cverb << vdebug << "calling perf_event_open for pid " << app_pid << endl;
for (unsigned event = 0; event < evts.size(); event++) {
ocount_accum_t count_data = {0ULL, 0ULL, 0ULL};
accum_counts.push_back(count_data);
prev_accum_counts.push_back(0ULL);
ocount_counter op_ctr(evts[event], true, true);
if ((rc = op_ctr.perf_event_open(app_pid, -1)) < 0) {
err_msg = "Internal Error. Perf event setup failed.";
goto error;
} else {
rc = 0;
}
perfCounters.push_back(op_ctr);
}
}
if (!rc) {
cverb << vdebug << "perf counter setup complete" << endl;
// Set bit to indicate we're set to go.
valid = true;
// Now that all events are programmed to start counting, init the start time
struct timespec tspec;
clock_gettime(CLOCK_MONOTONIC, &tspec);
start_time = tspec.tv_sec * 1000000000ULL + tspec.tv_nsec;
return;
}
error:
if (rc != OP_PERF_HANDLED_ERROR)
throw runtime_error(err_msg);
}
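/* Summary of the (pid, cpu) pairs setup() ends up passing to perf_event_open()
 * for each run mode:
 *
 *   OP_START_APP                    pid = app_pid,          cpu = -1
 *   OP_PROCLIST / OP_THREADLIST     pid = each task/thread, cpu = -1
 *   OP_CPULIST / OP_SYSWIDE         pid = -1,               cpu = each counted CPU
 *
 * (pid == -1 together with cpu == -1 is not a valid combination for the
 * syscall.)
 */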
void ocount_record::output_short_results(ostream & out, bool use_separation, bool scaled)
{
size_t num_iterations = use_separation ? perfCounters.size() : evts.size();
out << endl;
for (size_t num = 0; num < num_iterations; num++) {
ocount_accum_t tmp_accum;
double fraction_time_running;
if (use_separation) {
if (cpus_to_count.size()) {
out << perfCounters[num].get_cpu();
} else {
out << perfCounters[num].get_pid();
}
out << "," << perfCounters[num].get_event_name() << ",";
errno = 0;
cverb << vdebug << "Reading counter data for event " << perfCounters[num].get_event_name() << endl;
if (perfCounters[num].read_count_data(&tmp_accum) < 0) {
string err_msg = "Internal error: read of perfCounter fd failed with ";
err_msg += errno ? strerror(errno) : "unknown error";
throw runtime_error(err_msg);
}
fraction_time_running = scaled ? (double)tmp_accum.running_time/tmp_accum.enabled_time : 1;
if (with_time_interval) {
u64 save_prev = prev_accum_counts[num];
prev_accum_counts[num] = tmp_accum.count;
tmp_accum.count -= save_prev;
}
u64 scaled_count = tmp_accum.count ? tmp_accum.count/fraction_time_running : 0;
out << dec << scaled_count << ",";
} else {
fraction_time_running = scaled ? (double)accum_counts[num].running_time/accum_counts[num].enabled_time : 1;
u64 scaled_count = accum_counts[num].count ? accum_counts[num].count/fraction_time_running : 0;
out << perfCounters[num].get_event_name() << "," << dec << scaled_count << ",";
}
ostringstream strm_tmp;
if (use_separation) {
if (!tmp_accum.enabled_time) {
out << 0 << endl;
} else {
strm_tmp.precision(2);
strm_tmp << fixed << fraction_time_running * 100
<< endl;
out << strm_tmp.str();
}
} else {
if (!accum_counts[num].enabled_time) {
out << "Event not counted" << endl;
} else {
strm_tmp.precision(2);
strm_tmp << fixed << fraction_time_running * 100
<< endl;
out << strm_tmp.str();
}
}
}
}
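/* The short format above emits one comma-separated line per counter:
 *
 *   <cpu or task id>,<event name>,<scaled count>,<% time counted>
 *
 * when separation is in use, or
 *
 *   <event name>,<scaled count>,<% time counted>
 *
 * otherwise, with "Event not counted" in place of the percentage when the
 * event never went onto the PMU.
 */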
void ocount_record::output_long_results(ostream & out, bool use_separation,
int longest_event_name, bool scaled, u64 time_enabled)
{
#define COUNT_COLUMN_WIDTH 25
#define SEPARATION_ELEMENT_COLUMN_WIDTH 10
char space_padding[64], temp[64];
char const * cpu, * task, * scaling;
u64 num_seconds_enabled = time_enabled/1000000000;
unsigned int num_minutes_enabled = num_seconds_enabled/60;
cpu = "CPU";
task = "Task ID";
scaling = scaled ? "(scaled) " : "(actual) ";
// We put 8 spaces between the end of the event name and the beginning of the second column
unsigned int begin_second_col = longest_event_name + 8;
unsigned int num_pads = begin_second_col - strlen("Event");
memset(space_padding, ' ', 64);
strncpy(temp, space_padding, num_pads);
temp[num_pads] = '\0';
out << endl;
if (!with_time_interval) {
ostringstream strm;
strm << "Events were actively counted for ";
if (num_minutes_enabled) {
strm << num_minutes_enabled;
if (num_minutes_enabled > 1)
strm << " minutes and ";
else
strm << " minute and ";
strm << num_seconds_enabled % 60;
strm << " seconds.";
} else {
if (num_seconds_enabled) {
// Show 1/10's of seconds
strm.precision(1);
strm << fixed << (double)time_enabled/1000000000;
strm << " seconds.";
} else {
// Show full nanoseconds
strm << time_enabled << " nanoseconds.";
}
}
out << strm.str() << endl;
}
out << "Event counts " << scaling;
if (app_name)
out << "for " << app_name << ":";
else if (system_wide)
out << "for the whole system:";
else if (!cpus_to_count.empty())
out << "for the specified CPU(s):";
else if (tasks_are_threads)
out << "for the specified thread(s):";
else
out << "for the specified process(es):";
out << endl;
out << "\tEvent" << temp;
if (use_separation) {
if (cpus_to_count.size()) {
out << cpu;
num_pads = SEPARATION_ELEMENT_COLUMN_WIDTH - strlen(cpu);
} else {
out << task;
num_pads = SEPARATION_ELEMENT_COLUMN_WIDTH - strlen(task);
}
strncpy(temp, space_padding, num_pads);
temp[num_pads] = '\0';
out << temp;
}
out << "Count";
num_pads = COUNT_COLUMN_WIDTH - strlen("Count");
strncpy(temp, space_padding, num_pads);
temp[num_pads] = '\0';
out << temp << "% time counted" << endl;
/* If counting per-cpu or per-thread, I refer generically to cpu or thread values
* as "elements of separation". We will have one ocount_counter object per element of
* separation per event. So if we're counting 2 events for 4 processes (or threads),
* we'll have 2x4 (8) ocount_counter objects.
*
* If 'use_separation' is true, then we need to print individual counts for
* each element of separation for each event; otherwise, we print aggregated counts
* for each event.
*/
size_t num_iterations = use_separation ? perfCounters.size() : evts.size();
for (size_t num = 0; num < num_iterations; num++) {
double fraction_time_running;
out << "\t" << perfCounters[num].get_event_name();
num_pads = begin_second_col - perfCounters[num].get_event_name().length();
strncpy(temp, space_padding, num_pads);
temp[num_pads] = '\0';
out << temp;
ostringstream count_str;
ocount_accum_t tmp_accum;
if (use_separation) {
ostringstream separation_element_str;
if (cpus_to_count.size()) {
separation_element_str << dec << perfCounters[num].get_cpu();
out << perfCounters[num].get_cpu();
} else {
separation_element_str << dec << perfCounters[num].get_pid();
out << perfCounters[num].get_pid();
}
num_pads = SEPARATION_ELEMENT_COLUMN_WIDTH - separation_element_str.str().length();
strncpy(temp, space_padding, num_pads);
temp[num_pads] = '\0';
out << temp;
errno = 0;
cverb << vdebug << "Reading counter data for event " << perfCounters[num].get_event_name() << endl;
if (perfCounters[num].read_count_data(&tmp_accum) < 0) {
string err_msg = "Internal error: read of perfCounter fd failed with ";
err_msg += errno ? strerror(errno) : "unknown error";
throw runtime_error(err_msg);
}
fraction_time_running = scaled ? (double)tmp_accum.running_time/tmp_accum.enabled_time : 1;
if (with_time_interval) {
u64 save_prev = prev_accum_counts[num];
prev_accum_counts[num] = tmp_accum.count;
tmp_accum.count -= save_prev;
}
u64 scaled_count = tmp_accum.count ? tmp_accum.count/fraction_time_running : 0;
count_str << dec << scaled_count;
} else {
fraction_time_running = scaled ? (double)accum_counts[num].running_time/accum_counts[num].enabled_time : 1;
u64 scaled_count = accum_counts[num].count ? accum_counts[num].count/fraction_time_running : 0;
count_str << dec << scaled_count;
}
string count = count_str.str();
for (int i = count.size() - 3; i > 0; i-=3) {
count.insert(i, 1, ',');
}
out << count;
num_pads = COUNT_COLUMN_WIDTH - count.size();
strncpy(temp, space_padding, num_pads);
temp[num_pads] = '\0';
out << temp;
ostringstream strm_tmp;
if (use_separation) {
if (!tmp_accum.enabled_time) {
out << "Event not counted" << endl;
} else {
strm_tmp.precision(2);
strm_tmp << fixed << fraction_time_running * 100
<< endl;
out << strm_tmp.str();
}
} else {
if (!accum_counts[num].enabled_time) {
out << "Event not counted" << endl;
} else {
strm_tmp.precision(2);
strm_tmp << fixed << fraction_time_running * 100
<< endl;
out << strm_tmp.str();
}
}
}
}
void ocount_record::output_results(ostream & out, bool use_separation, bool short_format)
{
size_t longest_event_name = 0;
u64 time_enabled = 0ULL;
bool scaled = false;
for (unsigned long evt_num = 0; evt_num < evts.size(); evt_num++) {
if (strlen(evts[evt_num].name) > longest_event_name)
longest_event_name = strlen(evts[evt_num].name);
}
if (with_time_interval) {
// reset the accum count values
for (size_t i = 0; i < evts.size(); i++) {
ocount_accum_t accum = accum_counts[i];
accum.count = 0ULL;
accum_counts[i] = accum;
}
}
/* We need to inspect all of the count data now to ascertain if scaling
* is required, so we also collect aggregated counts into the accum_counts
* vector (if needed).
*/
for (unsigned long ocounter = 0; ocounter < perfCounters.size(); ocounter++) {
ocount_accum_t tmp_accum;
int evt_key = ocounter % evts.size();
errno = 0;
cverb << vdebug << "Reading counter data for event " << evts[evt_key].name << endl;
if (perfCounters[ocounter].read_count_data(&tmp_accum) < 0) {
string err_msg = "Internal error: read of perfCounter fd failed with ";
err_msg += errno ? strerror(errno) : "unknown error";
throw runtime_error(err_msg);
}
if (!use_separation) {
ocount_accum_t real_accum = accum_counts[evt_key];
real_accum.count += tmp_accum.count;
real_accum.enabled_time += tmp_accum.enabled_time;
real_accum.running_time += tmp_accum.running_time;
accum_counts[evt_key] = real_accum;
}
if (tmp_accum.enabled_time != tmp_accum.running_time) {
if (((double)(tmp_accum.enabled_time - tmp_accum.running_time)/tmp_accum.enabled_time) > 0.01)
scaled = true;
}
}
if (with_time_interval && !use_separation) {
for (size_t i = 0; i < evts.size(); i++) {
u64 save_prev = prev_accum_counts[i];
ocount_accum_t real_accum = accum_counts[i];
prev_accum_counts[i] = real_accum.count;
real_accum.count -= save_prev;
accum_counts[i] = real_accum;
}
}
struct timespec tspec;
clock_gettime(CLOCK_MONOTONIC, &tspec);
time_enabled = (tspec.tv_sec * 1000000000ULL + tspec.tv_nsec) - start_time;
if (short_format)
output_short_results(out, use_separation, scaled);
else
output_long_results(out, use_separation, longest_event_name, scaled, time_enabled);
}
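/* Sketch of the counter layout output_results() relies on, excluded from the
 * build: counters are created per element of separation (task or CPU) and,
 * within each element, in event order, so the event a counter belongs to
 * falls out of a simple modulo.
 */
#if 0
static size_t example_event_index(size_t counter_index, size_t num_events)
{
	// e.g. 2 events counted on 4 tasks gives 8 ocount_counter objects;
	// counter 5 belongs to event 5 % 2 == 1.
	return counter_index % num_events;
}
#endif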
int ocount_record::_get_one_process_info(pid_t pid)
{
char fname[PATH_MAX];
DIR *tids;
struct dirent dirent, *next;
int ret = 0;
add_process(pid);
if (are_tasks_processes()) {
snprintf(fname, sizeof(fname), "/proc/%d/task", pid);
tids = opendir(fname);
if (tids == NULL) {
// process must have exited
ret = -1;
cverb << vdebug << "Process " << pid << " apparently exited while "
<< "process info was being collected"<< endl;
goto out;
}
while (!readdir_r(tids, &dirent, &next) && next) {
char *end;
pid = strtol(dirent.d_name, &end, 10);
if (*end)
continue;
add_process(pid);
}
closedir(tids);
}
out:
return ret;
}
/* Obtain process information for one or more active processes, where the user
 * has passed in a set of processes or threads via the --process-list or
 * --thread-list option.
 */
int ocount_record::get_process_info(const vector<pid_t> & _procs)
{
int ret = 0;
if (cverb << vdebug)
cout << "op_get_process_info" << endl;
for (size_t i = 0; i < _procs.size(); i++) {
errno = 0;
if (kill(_procs[i], 0) < 0) {
if (errno == EPERM) {
string errmsg = "You do not have permission to monitor ";
errmsg += are_tasks_processes() ? "process " : "thread ";
cerr << errmsg << _procs[i] << endl;
ret = OP_PERF_HANDLED_ERROR;
}
break;
}
if ((ret = _get_one_process_info(_procs[i])) < 0)
break;
}
return ret;
}
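/* Sketch of the liveness/permission probe used above, excluded from the
 * build: kill() with signal 0 delivers nothing but still performs the normal
 * existence and permission checks, so errno distinguishes a task that has
 * exited from one we are not allowed to monitor.
 */
#if 0
static bool example_task_is_monitorable(pid_t pid)
{
	errno = 0;
	if (kill(pid, 0) == 0)
		return true;	// task exists and is signalable by us
	if (errno == EPERM)
		return false;	// task exists but belongs to another user
	return false;		// ESRCH: task has already exited
}
#endif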