blob: 30e64d8573f4565d3aa16a15fddf62b83e987733 [file] [log] [blame]
/**
* @file operf_utils.cpp
* Helper methods for perf_events-based OProfile.
*
* @remark Copyright 2011 OProfile authors
* @remark Read the file COPYING
*
* Created on: Dec 7, 2011
* @author Maynard Johnson
* (C) Copyright IBM Corp. 2011
*
* Modified by Maynard Johnson <maynardj@us.ibm.com>
* (C) Copyright IBM Corporation 2012, 2013
*
*/
#include <errno.h>
#include <dirent.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <cverb.h>
#include <iostream>
#include <sstream>
#include "operf_counter.h"
#include "operf_utils.h"
#ifdef HAVE_LIBPFM
#include <perfmon/pfmlib.h>
#endif
#include "op_types.h"
#include "operf_process_info.h"
#include "file_manip.h"
#include "operf_kernel.h"
#include "operf_sfile.h"
#include "op_fileio.h"
#include "op_libiberty.h"
#include "operf_stats.h"
extern verbose vmisc;
extern volatile bool quit;
extern volatile bool read_quit;
extern operf_read operfRead;
extern int sample_reads;
extern unsigned int pagesize;
extern char * app_name;
extern pid_t app_PID;
extern verbose vrecord;
extern verbose vconvert;
extern void __set_event_throttled(int index);
using namespace std;
map<pid_t, operf_process_info *> process_map;
multimap<string, struct operf_mmap *> all_images_map;
map<u64, struct operf_mmap *> kernel_modules;
struct operf_mmap * kernel_mmap;
bool first_time_processing;
bool throttled;
size_t mmap_size;
size_t pg_sz;
static list<event_t *> unresolved_events;
static struct operf_transient trans;
static bool sfile_init_done;
/* Some architectures (e.g., ppc64) do not use the same event value (code) for oprofile
* and for perf_events. The operf-record process requires event values that perf_events
* understands, but the operf-read process requires oprofile event values. The purpose of
* the following method is to map the operf-record event value to a value that
* opreport can understand.
*/
#if PPC64_ARCH
extern op_cpu cpu_type;
#define NIL_CODE ~0U
#if HAVE_LIBPFM3
static bool _get_codes_for_match(unsigned int pfm_idx, const char name[],
vector<operf_event_t> * evt_vec)
{
unsigned int num_events = evt_vec->size();
int tmp_code, ret;
char evt_name[OP_MAX_EVT_NAME_LEN];
unsigned int events_converted = 0;
for (unsigned int i = 0; i < num_events; i++) {
operf_event_t event = (*evt_vec)[i];
if (event.evt_code != NIL_CODE) {
events_converted++;
continue;
}
memset(evt_name, 0, OP_MAX_EVT_NAME_LEN);
if (!strcmp(event.name, "CYCLES")) {
strcpy(evt_name ,"PM_CYC") ;
} else if (strstr(event.name, "_GRP")) {
string str = event.name;
strncpy(evt_name, event.name, str.rfind("_GRP"));
} else {
strncpy(evt_name, event.name, strlen(event.name));
}
if (strncmp(name, evt_name, OP_MAX_EVT_NAME_LEN))
continue;
ret = pfm_get_event_code(pfm_idx, &tmp_code);
if (ret != PFMLIB_SUCCESS) {
string evt_name_str = event.name;
string msg = "libpfm cannot find event code for " + evt_name_str +
"; cannot continue";
throw runtime_error(msg);
}
event.evt_code = tmp_code;
(*evt_vec)[i] = event;
events_converted++;
cverb << vrecord << "Successfully converted " << event.name << " to perf_event code "
<< hex << tmp_code << endl;
}
return (events_converted == num_events);
}
#else
/*
 * libpfm4 helper: obtain the raw perf_events code for every entry in
 * 'evt_vec' whose evt_code is still NIL_CODE.
 *
 * Name normalization applied before asking libpfm:
 *   - "CYCLES" is looked up as "PM_CYC";
 *   - a name containing "_GRP" is truncated at the last "_GRP";
 *   - any other name is used unchanged.
 *
 * Returns true when every entry of 'evt_vec' has been converted. Throws
 * runtime_error if libpfm cannot be initialized or cannot encode an event.
 */
static bool _op_get_event_codes(vector<operf_event_t> * evt_vec)
{
	int ret;
	unsigned int num_events = evt_vec->size();
	char evt_name[OP_MAX_EVT_NAME_LEN];
	unsigned int events_converted = 0;

	/* Local mirror of the argument block handed to
	 * pfm_get_os_event_encoding() for PFM_OS_NONE.
	 */
	typedef struct {
		uint64_t *codes;
		char **fstr;
		size_t size;
		int count;
		int idx;
	} pfm_raw_pmu_encode_t;
	pfm_raw_pmu_encode_t raw;

	if (pfm_initialize() != PFM_SUCCESS)
		throw runtime_error("Unable to initialize libpfm; cannot continue");

	for (unsigned int i = 0; i < num_events; i++) {
		operf_event_t event = (*evt_vec)[i];
		if (event.evt_code != NIL_CODE) {
			events_converted++;
			continue;
		}

		memset(evt_name, 0, OP_MAX_EVT_NAME_LEN);
		if (!strcmp(event.name, "CYCLES")) {
			strcpy(evt_name, "PM_CYC");
		} else {
			size_t copy_len;
			if (strstr(event.name, "_GRP")) {
				string str = event.name;
				copy_len = str.rfind("_GRP");
			} else {
				copy_len = strlen(event.name);
			}
			// Never write past evt_name; the buffer was zeroed above, so
			// leaving the last byte untouched keeps it NUL-terminated.
			if (copy_len >= OP_MAX_EVT_NAME_LEN)
				copy_len = OP_MAX_EVT_NAME_LEN - 1;
			strncpy(evt_name, event.name, copy_len);
		}

		/* With codes == NULL and count == 0, libpfm allocates the codes
		 * array itself; it must be released with free() once we are done.
		 */
		memset(&raw, 0, sizeof(raw));
		ret = pfm_get_os_event_encoding(evt_name, PFM_PLM3, PFM_OS_NONE, &raw);
		if (ret != PFM_SUCCESS || raw.codes == NULL || raw.count < 1) {
			free(raw.codes);
			string evt_name_str = event.name;
			string msg = "libpfm cannot find event code for " + evt_name_str +
					"; cannot continue";
			throw runtime_error(msg);
		}
		event.evt_code = raw.codes[0];
		free(raw.codes);
		raw.codes = NULL;

		(*evt_vec)[i] = event;
		events_converted++;
		cverb << vrecord << "Successfully converted " << event.name << " to perf_event code "
		      << hex << event.evt_code << endl;
	}
	return (events_converted == num_events);
}
#endif
/*
 * Convert the oprofile event values in 'evt_vec' to codes that perf_events
 * understands.
 *
 * Step 1: reset every evt_code to NIL_CODE, except that on POWER7 the events
 *         whose names start with "PM_RUN_CYC" / "PM_RUN_INST_CMPL" receive
 *         fixed codes.
 * Step 2: resolve the remaining NIL_CODE entries through libpfm (libpfm3 or
 *         the newer API, depending on HAVE_LIBPFM3).
 *
 * Returns true if all events were converted. May throw runtime_error when
 * libpfm fails.
 */
bool OP_perf_utils::op_convert_event_vals(vector<operf_event_t> * evt_vec)
{
	const bool is_power7 = (cpu_type == CPU_PPC64_POWER7);

	for (unsigned int pos = 0; pos < evt_vec->size(); pos++) {
		operf_event_t & evt = (*evt_vec)[pos];

		evt.evt_code = NIL_CODE;
		if (!is_power7)
			continue;
		if (strncmp(evt.name, "PM_RUN_CYC", strlen("PM_RUN_CYC")) == 0)
			evt.evt_code = 0x600f4;
		else if (strncmp(evt.name, "PM_RUN_INST_CMPL", strlen("PM_RUN_INST_CMPL")) == 0)
			evt.evt_code = 0x500fa;
	}

#if HAVE_LIBPFM3
	unsigned int pfm_event_count;
	char pfm_event_name[256];

	if (pfm_initialize() != PFMLIB_SUCCESS)
		throw runtime_error("Unable to initialize libpfm; cannot continue");
	if (pfm_get_num_events(&pfm_event_count) != PFMLIB_SUCCESS)
		throw runtime_error("Unable to use libpfm to obtain event code; cannot continue");

	// Walk libpfm's event table until every requested event has a code.
	unsigned int pfm_idx = 0;
	while (pfm_idx < pfm_event_count) {
		if (pfm_get_event_name(pfm_idx, pfm_event_name, 256) == PFMLIB_SUCCESS &&
		    _get_codes_for_match(pfm_idx, pfm_event_name, evt_vec))
			break;
		pfm_idx++;
	}
	// Reaching the end of the table means at least one event was left unresolved.
	return (pfm_idx != pfm_event_count);
#else
	return _op_get_event_codes(evt_vec);
#endif
}
#endif // PPC64_ARCH
/* Record the current sample file and pc of 'trans' as its "last" ones. */
static inline void update_trans_last(struct operf_transient * trans)
{
	struct operf_transient & t = *trans;

	t.last_pc = t.pc;
	t.last = t.current;
}
/* Invalidate the cached process context of 'trans': the tgid is set to the
 * all-ones sentinel and the cached process info pointer is dropped.
 */
static inline void clear_trans(struct operf_transient * trans)
{
	struct operf_transient & t = *trans;

	t.cur_procinfo = NULL;
	t.tgid = ~0U;
}
/*
 * Handle a PERF_RECORD_FORK event: make sure process_map holds an
 * operf_process_info for the parent and, where appropriate, associate the
 * forked process with that parent.
 */
static void __handle_fork_event(event_t * event)
{
	if (cverb << vconvert)
		cout << "PERF_RECORD_FORK for tgid/tid = " << event->fork.pid
		     << "/" << event->fork.tid << "; parent " << event->fork.ppid
		     << "/" << event->fork.ptid << endl;

	// Step 1: find the parent's proc info, creating one if necessary.
	operf_process_info * parent_info = NULL;
	map<pid_t, operf_process_info *>::iterator pos = process_map.find(event->fork.ppid);
	if (pos == process_map.end()) {
		// No COMM event has been seen for the parent yet, so the new
		// proc info is created in the "invalid" state.
		parent_info = new operf_process_info(event->fork.ppid, app_name,
		                                     app_name != NULL, false);
		if (cverb << vconvert)
			cout << "Adding new proc info to collection for parent PID "
			     << event->fork.ppid << endl;
		process_map[event->fork.ppid] = parent_info;
	} else {
		parent_info = pos->second;
	}

	/* Proc infos are keyed by pid, so when the forked pid equals the parent's
	 * pid both refer to the same object and there is nothing more to do.
	 */
	if (event->fork.pid == event->fork.ppid)
		return;

	// Step 2: find or create the proc info for the forked process.
	pos = process_map.find(event->fork.pid);
	if (pos == process_map.end()) {
		operf_process_info * new_child = new operf_process_info(event->fork.pid, NULL,
		                                                        false, false);
		if (cverb << vconvert)
			cout << "Adding new proc info to collection for forked PID "
			     << event->fork.pid << endl;
		process_map[event->fork.pid] = new_child;
		new_child->set_fork_info(parent_info);
		return;
	}

	/* A proc info for the forked pid already exists. Events may reach
	 * userspace in any order (e.g., MMAP, FORK, then COMM rather than the
	 * "expected" FORK followed by COMM), so both states must be handled:
	 *
	 *   - valid: a COMM event was already received for the forked process;
	 *     it is treated as a standalone process and the FORK is ignored.
	 *   - not valid: associate it with its parent unless that association
	 *     was already made (is_forked()).
	 */
	operf_process_info * child_info = pos->second;
	if (child_info->is_valid()) {
		if (cverb << vconvert)
			cout << "Forked proc " << event->fork.pid
			     << " is currently valid (i.e., PERF_RECORD_COMM already received),"
			     << " so is independent from parent "
			     << event->fork.ppid << endl;
		return;
	}
	if (child_info->is_forked())
		return;

	child_info->set_fork_info(parent_info);
	if (cverb << vconvert)
		cout << "Set fork info for PID " << event->fork.pid
		     << " with parent " << event->fork.ppid << endl;
}
/*
 * Handle a PERF_RECORD_COMM event: create the operf_process_info for the
 * event's pid if none exists yet, otherwise bring the existing one up to date
 * (validity and application name).
 */
static void __handle_comm_event(event_t * event)
{
	if (cverb << vconvert)
		cout << "PERF_RECORD_COMM for " << event->comm.comm << ", tgid/tid = "
		     << event->comm.pid << "/" << event->comm.tid << endl;

	map<pid_t, operf_process_info *>::iterator pos = process_map.find(event->comm.pid);

	if (pos == process_map.end()) {
		/* First event of any kind for this pid. A COMM event can be the
		 * result of the app doing a fork/exec, in which case the name
		 * carried by the event is used; for the profiled app itself the
		 * complete app_name is used instead.
		 */
		const bool is_complete_appname = app_name && (app_PID == event->comm.pid);
		const char * appname_arg = is_complete_appname ? app_name : event->comm.comm;

		/* If tid != pid, this may be a forked process for which neither the
		 * PERF_RECORD_FORK event nor any other event (e.g., the parent's
		 * COMM) has arrived yet. Such proc infos start out "invalid" so
		 * samples are not falsely attributed to a child thread instead of
		 * its parent. They become valid when a COMM event with tid == pid
		 * arrives, or during reprocessing if that never happens.
		 */
		const bool valid_bit = (event->comm.pid == event->comm.tid);

		operf_process_info * created = new operf_process_info(event->comm.pid, appname_arg,
		                                                      is_complete_appname, valid_bit);
		if (cverb << vconvert)
			cout << "Adding new proc info to collection for PID " << event->comm.pid << endl;
		process_map[event->comm.pid] = created;
		return;
	}

	/* A proc info for this pid already exists. Only "valid" COMM events
	 * (tid == pid) may change it; child COMM events are ignored.
	 *
	 *  o Existing proc info is valid:
	 *      its appname may still be unset/invalid, so offer this event's
	 *      name via set_appname().
	 *  o Existing proc info is not valid:
	 *      - created by FORK: now that a COMM event arrived, the process
	 *        should be standalone, so try_disassociate_from_parent();
	 *      - created by an MMAP event or similar: mark valid and set name.
	 */
	operf_process_info * existing = pos->second;
	const bool is_parent_comm = (event->comm.pid == event->comm.tid);

	if (existing->is_valid()) {
		if (is_parent_comm && !existing->is_appname_valid())
			existing->set_appname(event->comm.comm, false);
		return;
	}

	if (!is_parent_comm)
		return;

	if (existing->is_forked()) {
		existing->try_disassociate_from_parent(event->comm.comm);
	} else {
		existing->set_valid();
		existing->set_appname(event->comm.comm, false);
	}
}
/*
 * Handle a PERF_RECORD_MMAP event.
 *
 * An operf_mmap describing the mapping is looked up in all_images_map (keyed
 * by image basename) or created. Kernel-domain mappings are then recorded
 * either as the vmlinux mapping (kernel_mmap) or as a kernel module; all other
 * mappings are handed to the owning process's operf_process_info, which is
 * created on demand.
 */
static void __handle_mmap_event(event_t * event)
{
	static bool kptr_restrict_warning_displayed_already = false;
	string image_basename = op_basename(event->mmap.filename);
	struct operf_mmap * mapping = NULL;
	// True while 'mapping' was allocated by this call and nothing else
	// (container or global) refers to it yet.
	bool mapping_unowned = false;
	multimap<string, struct operf_mmap *>::iterator it;
	pair<multimap<string, struct operf_mmap *>::iterator,
	     multimap<string, struct operf_mmap *>::iterator> range;

	/* Re-use an already known mapping with the same name and start address.
	 * NOTE(review): the stored filename is compared against the *basename*
	 * of the incoming path; confirm whether that is intended.
	 */
	range = all_images_map.equal_range(image_basename);
	for (it = range.first; it != range.second; it++) {
		if (((strcmp((*it).second->filename, image_basename.c_str())) == 0)
		    && ((*it).second->start_addr == event->mmap.start)) {
			mapping = (*it).second;
			break;
		}
	}

	if (!mapping) {
		mapping = new struct operf_mmap;
		mapping_unowned = true;
		memset(mapping, 0, sizeof(struct operf_mmap));
		mapping->start_addr = event->mmap.start;
		// Bounded copy: the struct was zeroed above, so leaving the final
		// byte untouched guarantees NUL termination.
		strncpy(mapping->filename, event->mmap.filename,
		        sizeof(mapping->filename) - 1);
		/* Mappings starting with "/" are for either a file or shared memory object.
		 * From the kernel's perf_events subsystem, anon maps have labels like:
		 *     [heap], [stack], [vdso], //anon
		 */
		if (mapping->filename[0] == '[') {
			mapping->is_anon_mapping = true;
		} else if ((strncmp(mapping->filename, "//anon",
		                    strlen("//anon")) == 0)) {
			mapping->is_anon_mapping = true;
			strcpy(mapping->filename, "anon");
		}
		// A zero-length mapping gets end_addr 0 rather than wrapping to start - 1.
		mapping->end_addr = (event->mmap.len == 0ULL)? 0ULL : mapping->start_addr + event->mmap.len - 1;
		mapping->pgoff = event->mmap.pgoff;
		if (cverb << vconvert) {
			cout << "PERF_RECORD_MMAP for process " << hex << event->mmap.pid << "/"
			     << event->mmap.tid << ": " << event->mmap.filename << endl;
			// Restore decimal formatting afterwards so that later cout
			// output (e.g., PIDs) is not silently printed in hex.
			cout << "\tstart_addr: " << hex << mapping->start_addr
			     << "; end addr: " << mapping->end_addr << dec << endl;
		}
		if (event->header.misc & PERF_RECORD_MISC_USER) {
			all_images_map.insert(pair<string, struct operf_mmap *>(image_basename, mapping));
			mapping_unowned = false;
		}
	}

	if (event->header.misc & PERF_RECORD_MISC_KERNEL) {
		if (!strncmp(mapping->filename, operf_get_vmlinux_name(),
		             strlen(mapping->filename))) {
			/* The kernel_mmap is just a convenience variable
			 * for use when mapping samples to kernel space, since
			 * most of the kernel samples will be attributable to
			 * the vmlinux file versus kernel modules.
			 */
			kernel_mmap = mapping;
		} else {
			if ((kptr_restrict == 1) && !no_vmlinux && (my_uid != 0)) {
				if (!kptr_restrict_warning_displayed_already) {
					kptr_restrict_warning_displayed_already = true;
					cerr << endl << "< < < WARNING > > >" << endl;
					cerr << "Samples for vmlinux kernel will be recorded, but kernel module profiling"
					     << endl << "is not possible with current system config." << endl;
					cerr << "Set /proc/sys/kernel/kptr_restrict to 0 to see samples for kernel modules."
					     << endl << "< < < < < > > > > >" << endl << endl;
				}
				// The module mapping is not recorded anywhere on this path;
				// release it if it was allocated above and never stored.
				if (mapping_unowned) {
					delete mapping;
					mapping = NULL;
				}
			} else {
				operf_create_module(mapping->filename,
				                    mapping->start_addr,
				                    mapping->end_addr);
				kernel_modules[mapping->start_addr] = mapping;
			}
		}
	} else {
		map<pid_t, operf_process_info *>::iterator it;
		it = process_map.find(event->mmap.pid);
		if (it == process_map.end()) {
			/* Create a new proc info object, but mark it invalid since we have
			 * not yet received a COMM event for this PID. This MMAP event may
			 * be on behalf of a process created as a result of a fork/exec.
			 * The order of delivery of events is not guaranteed so we may see
			 * this MMAP event before getting the COMM event for that process.
			 * If this is the case here, we just pass NULL for appname arg.
			 * It will get fixed up later when the COMM event occurs.
			 */
			const char * appname_arg;
			bool is_complete_appname;
			if (app_name && (app_PID == event->mmap.pid)) {
				appname_arg = app_name;
				is_complete_appname = true;
			} else {
				appname_arg = NULL;
				is_complete_appname = false;
			}
			operf_process_info * proc = new operf_process_info(event->mmap.pid, appname_arg,
			                                                   is_complete_appname, false);
			process_map[event->mmap.pid] = proc;
			proc->process_mapping(mapping, false);
		} else {
			it->second->process_mapping(mapping, false);
		}
		if (cverb << vconvert)
			cout << "Process mapping for " << event->mmap.filename << " on behalf of "
			     << event->mmap.pid << endl;
	}
}
/*
 * Resolve a sample to its process and memory mapping and load the result into
 * the file-static 'trans' object.
 *
 * @param data               sample being resolved (pid, tid, ip, cpu, id are read)
 * @param hypervisor_domain  true if the sample was taken in the hypervisor domain
 * @param kernel_mode        true if the sample address is a kernel address
 *
 * @return pointer to 'trans' on success; NULL if the process is unknown (or
 *         has no valid appname yet) or no mapping contains the address.
 *
 * When a sample is dropped outside of first-time processing, the matching
 * operf_stats counter is incremented regardless of the verbosity setting.
 */
static struct operf_transient * __get_operf_trans(struct sample_data * data, bool hypervisor_domain,
                                                  bool kernel_mode)
{
	operf_process_info * proc = NULL;
	const struct operf_mmap * op_mmap = NULL;

	if (trans.tgid == data->pid && trans.cur_procinfo) {
		// Same process as the previous sample: reuse the cached proc info.
		proc = trans.cur_procinfo;
		if (cverb << vconvert)
			cout << "trans.tgid == data->pid : " << data->pid << endl;
	} else {
		// Find operf_process info for data.tgid.
		std::map<pid_t, operf_process_info *>::const_iterator it = process_map.find(data->pid);
		if (it != process_map.end() && it->second->is_appname_valid()) {
			proc = it->second;
		} else {
			// This can validly happen if we get a sample before getting a
			// COMM event for the process.
			if (!first_time_processing) {
				if (cverb << vconvert)
					cout << "Dropping sample -- process info unavailable for PID " << data->pid << endl;
				if (kernel_mode)
					operf_stats[OPERF_NO_APP_KERNEL_SAMPLE]++;
				else
					operf_stats[OPERF_NO_APP_USER_SAMPLE]++;
			}
			return NULL;
		}
	}

	// Now find the mapping that contains the data.ip address.
	if (kernel_mode) {
		if (kernel_mmap &&
		    data->ip >= kernel_mmap->start_addr &&
		    data->ip <= kernel_mmap->end_addr) {
			op_mmap = kernel_mmap;
		} else {
			map<u64, struct operf_mmap *>::iterator it;
			for (it = kernel_modules.begin(); it != kernel_modules.end(); it++) {
				if (data->ip >= it->second->start_addr &&
				    data->ip <= it->second->end_addr) {
					op_mmap = it->second;
					break;
				}
			}
		}
		// A kernel mapping with start and end both zero accepts any
		// kernel address that nothing else claimed.
		if (!op_mmap && kernel_mmap &&
		    (kernel_mmap->start_addr == 0ULL) &&
		    (kernel_mmap->end_addr == 0ULL))
			op_mmap = kernel_mmap;
		/* If op_mmap is still NULL here, this can happen if a kernel module
		 * is loaded after profiling starts, and then we get samples for
		 * that kernel module.
		 * TODO: Fix this.
		 */
	} else {
		op_mmap = proc->find_mapping_for_sample(data->ip);
		if (op_mmap && op_mmap->is_hypervisor && !hypervisor_domain) {
			cverb << vconvert << "Invalid sample: Address falls within hypervisor address range, but is not a hypervisor domain sample." << endl;
			operf_stats[OPERF_INVALID_CTX]++;
			op_mmap = NULL;
		}
	}

	if (!op_mmap) {
		if (!first_time_processing) {
			if (cverb << vconvert) {
				// Use this sample's domain, not the stale value left in
				// 'trans' by an earlier sample.
				string domain = kernel_mode ? "kernel" : "userspace";
				ostringstream message;
				message << "Discarding " << domain << " sample for process " << data->pid
				        << " where no appropriate mapping was found. (pc=0x"
				        << hex << data->ip <<")" << endl;
				cout << message.str();
			}
			operf_stats[OPERF_LOST_NO_MAPPING]++;
		}
		return NULL;
	}

	// Use that mapping to set the fields in trans.
	if (cverb << vconvert)
		cout << "Found mmap for sample; image_name is " << op_mmap->filename <<
			" and app name is " << proc->get_app_name() << endl;
	trans.image_name = op_mmap->filename;
	trans.app_len = proc->get_app_name().size();
	strncpy(trans.app_filename, proc->get_app_name().c_str(), trans.app_len);
	trans.app_filename[trans.app_len] = '\0';
	trans.image_len = strlen(trans.image_name);
	trans.start_addr = op_mmap->start_addr;
	trans.end_addr = op_mmap->end_addr;
	trans.tgid = data->pid;
	trans.tid = data->tid;
	trans.cur_procinfo = proc;
	trans.cpu = data->cpu;
	trans.is_anon = op_mmap->is_anon_mapping;
	trans.in_kernel = kernel_mode;
	// Kernel and anonymous-region samples keep the absolute address; all
	// others are stored as an offset from the start of the mapping.
	if (trans.in_kernel || trans.is_anon)
		trans.pc = data->ip;
	else
		trans.pc = data->ip - trans.start_addr;
	trans.sample_id = data->id;
	return &trans;
}
/*
 * Walk the callchain stored at 'array' and log a call arc for every entry
 * that can be resolved to a sample file.
 *
 * Entries at or above PERF_CONTEXT_MAX are context markers, not addresses:
 * PERF_CONTEXT_KERNEL / PERF_CONTEXT_USER switch the domain used for the
 * entries that follow; all other markers (including PERF_CONTEXT_HV, which is
 * not supported for callgraph) are skipped.
 *
 * Side effect: data->callchain is set and data->ip is overwritten with each
 * visited callchain entry.
 */
static void __handle_callchain(u64 * array, struct sample_data * data)
{
	data->callchain = (struct ip_callchain *) array;
	if (!data->callchain->nr)
		return;

	if (cverb << vconvert)
		cout << "Processing callchain" << endl;

	bool kernel_ctx = false;
	for (int n = 0; n < data->callchain->nr; n++) {
		data->ip = data->callchain->ips[n];

		if (data->ip >= PERF_CONTEXT_MAX) {
			// Context marker.
			// TODO: log lost callgraph arc for PERF_CONTEXT_HV
			if (data->ip == PERF_CONTEXT_KERNEL)
				kernel_ctx = true;
			else if (data->ip == PERF_CONTEXT_USER)
				kernel_ctx = false;
			continue;
		}

		// A zero address is silently skipped.
		if (!data->ip)
			continue;

		if (!__get_operf_trans(data, false, kernel_ctx)) {
			operf_stats[OPERF_BT_LOST_NO_MAPPING]++;
			continue;
		}

		trans.current = operf_sfile_find(&trans);
		if (trans.current) {
			operf_sfile_log_arc(&trans);
			update_trans_last(&trans);
		}
	}
}
/*
 * Register hypervisor sample address 'ip' with the proc info of process
 * 'pid', creating that proc info first if the process is not yet known.
 */
static void __map_hypervisor_sample(u64 ip, u32 pid)
{
	map<pid_t, operf_process_info *>::iterator pos = process_map.find(pid);

	if (pos != process_map.end()) {
		pos->second->process_hypervisor_mapping(ip);
		return;
	}

	/* Unknown process: create a proc info marked invalid, since no COMM
	 * event has been received for this PID yet. The sample may belong to a
	 * process created by fork/exec; event delivery order is not guaranteed,
	 * so the sample can arrive before that process's COMM event. In that
	 * case the appname is passed as NULL and is fixed up when the COMM
	 * event occurs.
	 */
	const bool is_complete_appname = app_name && (app_PID == pid);
	const char * appname_arg = is_complete_appname ? app_name : NULL;

	operf_process_info * created = new operf_process_info(pid, appname_arg,
	                                                      is_complete_appname, false);
	if (cverb << vconvert)
		cout << "Adding new proc info to collection for PID " << pid << endl;
	process_map[pid] = created;
	created->process_hypervisor_mapping(ip);
}
/*
 * Handle a PERF_RECORD_THROTTLE event: translate the perf event id to an
 * event number (stored in trans.event) and flag that event as throttled.
 *
 * Returns 0 on success, -1 if the id does not map to a known event.
 * 'sample_type' is not used.
 */
static int __handle_throttle_event(event_t * event, u64 sample_type)
{
	trans.event = operfRead.get_eventnum_by_perf_event_id(event->throttle.id);
	if (trans.event < 0)
		return -1;

	__set_event_throttled(trans.event);
	return 0;
}
/*
 * Handle a PERF_RECORD_SAMPLE event: decode the sample, resolve it to a
 * sample file and log it (plus its callchain, if present).
 *
 * @param event        the raw perf sample record
 * @param sample_type  PERF_SAMPLE_* bitmask describing the record layout
 *
 * @return 0 normally; -1 if the record lacks a mandatory field (IP, TID, ID)
 *         or carries an unknown event id, i.e. the data stream looks corrupt.
 *
 * During first-time processing, samples that cannot be resolved yet (and all
 * hypervisor samples) are copied onto unresolved_events for a later pass.
 * Every exit through 'out' resets the cached context in 'trans'.
 */
static int __handle_sample_event(event_t * event, u64 sample_type)
{
	struct sample_data data;
	bool found_trans = false;
	bool in_kernel = false;
	int rc = 0;
	bool hypervisor = (event->header.misc == PERF_RECORD_MISC_HYPERVISOR);
	u64 *array = event->sample.array;

	// PERF_SAMPLE_CPU is optional; give the field a defined value so it is
	// never read uninitialized when the record does not carry it.
	data.cpu = 0;

	/* As we extract the various pieces of information from the sample data array,
	 * if we find that the sample type does not match up with an expected mandatory
	 * perf_event_sample_format, we consider this as corruption of the sample data
	 * stream. Since it wouldn't make sense to continue with suspect data, we quit.
	 */
	if (sample_type & PERF_SAMPLE_IP) {
		data.ip = event->ip.ip;
		array++;
	} else {
		rc = -1;
		goto done;
	}
	if (sample_type & PERF_SAMPLE_TID) {
		// pid and tid are packed as two 32-bit values in one array slot.
		u_int32_t *p = (u_int32_t *)array;
		data.pid = p[0];
		data.tid = p[1];
		array++;
	} else {
		rc = -1;
		goto done;
	}
	data.id = ~0ULL;
	if (sample_type & PERF_SAMPLE_ID) {
		data.id = *array;
		array++;
	} else {
		rc = -1;
		goto done;
	}
	// PERF_SAMPLE_CPU is optional (see --separate-cpu).
	if (sample_type & PERF_SAMPLE_CPU) {
		u_int32_t *p = (u_int32_t *)array;
		data.cpu = *p;
		array++;
	}

	// Classify the sample's domain.
	if (event->header.misc == PERF_RECORD_MISC_KERNEL) {
		in_kernel = true;
	} else if (event->header.misc == PERF_RECORD_MISC_USER) {
		in_kernel = false;
	}
#if PPC64_ARCH
	else if (event->header.misc == PERF_RECORD_MISC_HYPERVISOR) {
#define MAX_HYPERVISOR_ADDRESS 0xfffffffULL
		if (data.ip > MAX_HYPERVISOR_ADDRESS) {
			cverb << vconvert << "Discarding out-of-range hypervisor sample: "
			      << hex << data.ip << endl;
			operf_stats[OPERF_LOST_INVALID_HYPERV_ADDR]++;
			goto out;
		}
		in_kernel = false;
		if (first_time_processing) {
			__map_hypervisor_sample(data.ip, data.pid);
		}
	}
#endif
	else {
		// TODO: Unhandled types are the guest kernel and guest user samples.
		// We should at least log what we're throwing away.
		if (cverb << vconvert) {
			const char * domain;
			switch (event->header.misc) {
			case PERF_RECORD_MISC_HYPERVISOR:
				domain = "hypervisor";
				break;
#if HAVE_PERF_GUEST_MACROS
			case PERF_RECORD_MISC_GUEST_KERNEL:
				domain = "guest OS";
				break;
			case PERF_RECORD_MISC_GUEST_USER:
				domain = "guest user";
				break;
#endif
			default:
				domain = "unknown";
				break;
			}
			ostringstream message;
			message << "Discarding sample from " << domain << " domain: "
			        << hex << data.ip << endl;
			cout << message.str();
		}
		goto out;
	}

	/* If the static variable trans.tgid is still holding its initial value of 0,
	 * then we would incorrectly find trans.tgid and data.pid matching, and
	 * make wrong assumptions from that match -- ending in a seg fault. So we
	 * will bail out early if we see a sample for PID 0 coming in and trans.image_name
	 * is NULL (implying the trans object is still in its initial state).
	 */
	if (!trans.image_name && (data.pid == 0)) {
		cverb << vconvert << "Discarding sample for PID 0" << endl;
		goto out;
	}

	if (cverb << vconvert) {
		ostringstream message;
		message << "(IP, " << event->header.misc << "): " << dec << data.pid << "/"
		        << data.tid << ": " << hex << (unsigned long long)data.ip
		        << endl << "\tdata ID: " << data.id << endl;
		cout << message.str();
	}

	// Verify the sample. The event number only needs to be looked up again
	// when the id differs from that of the previously resolved sample.
	if (data.id != trans.sample_id) {
		trans.event = operfRead.get_eventnum_by_perf_event_id(data.id);
		if (trans.event < 0) {
			cerr << "Event num " << trans.event << " for id " << data.id
			     << " is invalid. Sample data appears to be corrupted." << endl;
			rc = -1;
			goto out;
		}
	}

	/* Only need to check for "no_user" since "no_kernel" is done by
	 * perf_events code.
	 */
	if ((operfRead.get_event_by_counter(trans.event)->no_user) &&
	    (event->header.misc == PERF_RECORD_MISC_USER)) {
		// Dropping user domain sample by user request in event spec.
		goto out;
	}

	if ((event->header.misc == PERF_RECORD_MISC_HYPERVISOR) && first_time_processing) {
		/* We defer processing hypervisor samples until all the samples
		 * are processed. We do this because we synthesize an mmapping
		 * for hypervisor samples and need to modify it (start_addr and/or
		 * end_addr) as new hypervisor samples arrive. If we completely
		 * processed the hypervisor samples during "first_time_processing",
		 * we would end up (usually) with multiple "[hypervisor_bucket]" sample files,
		 * each with a unique address range. So we'll stick the event on
		 * the unresolved_events list to be re-processed later.
		 */
		event_t * ev = (event_t *)xmalloc(event->header.size);
		memcpy(ev, event, event->header.size);
		unresolved_events.push_back(ev);
		if (cverb << vconvert)
			cout << "Deferring processing of hypervisor sample." << endl;
		goto out;
	}

	/* Check for the common case first -- i.e., where the current sample is from
	 * the same context as the previous sample. For the "no-vmlinux" case, start_addr
	 * and end_addr will be zero, so need to make sure we detect that.
	 * The last resort (and most expensive) is to call __get_operf_trans() if the
	 * sample cannot be matched up with a previous tran object.
	 */
	if (in_kernel) {
		if (trans.image_name && trans.tgid == data.pid) {
			// For the no-vmlinux case . . .
			if ((trans.start_addr == 0ULL) && (trans.end_addr == 0ULL)) {
				trans.pc = data.ip;
				found_trans = true;
			// For samples in vmlinux or kernel module
			} else if (data.ip >= trans.start_addr && data.ip <= trans.end_addr) {
				trans.pc = data.ip;
				found_trans = true;
			}
		}
	} else if (trans.tgid == data.pid && data.ip >= trans.start_addr && data.ip <= trans.end_addr) {
		trans.tid = data.tid;
		if (trans.is_anon)
			trans.pc = data.ip;
		else
			trans.pc = data.ip - trans.start_addr;
		found_trans = true;
	}

	if (!found_trans && __get_operf_trans(&data, hypervisor, in_kernel)) {
		trans.current = operf_sfile_find(&trans);
		found_trans = true;
	}

	/*
	 * trans.current may be NULL if a kernel sample falls through
	 * the cracks, or if it's a sample from an anon region we couldn't find
	 */
	if (found_trans && trans.current) {
		/* log the sample or arc */
		operf_sfile_log_sample(&trans);
		update_trans_last(&trans);
		if (sample_type & PERF_SAMPLE_CALLCHAIN)
			__handle_callchain(array, &data);
		goto done;
	}

	if (first_time_processing) {
		// Keep a private copy for the reprocessing pass. xmalloc() is used,
		// as for deferred hypervisor samples above, so that an allocation
		// failure can never lead to a memcpy() into NULL.
		event_t * ev = (event_t *)xmalloc(event->header.size);
		memcpy(ev, event, event->header.size);
		unresolved_events.push_back(ev);
	}

out:
	clear_trans(&trans);
done:
	return rc;
}
/* Dispatch one perf record to its handler. Used by
 * operf_read::convertPerfData() to convert perf-formatted data to oprofile
 * sample data files.
 *
 * After the header information in the perf sample data, the next piece of
 * data is typically the PERF_RECORD_COMM record, which tells us the name of
 * the application/command being profiled. This is followed by
 * PERF_RECORD_MMAP records, which indicate what binary executables and
 * libraries were mmap'ed into process memory when profiling began. Additional
 * PERF_RECORD_MMAP records may appear later in the data stream (e.g., dlopen
 * for single-process profiling or new process startup for system-wide
 * profiling).
 *
 * This function returns '0' on success and '-1' on failure. A failure implies
 * the sample data is probably corrupt and the calling function should handle
 * appropriately.
 */
int OP_perf_utils::op_write_event(event_t * event, u64 sample_type)
{
	int rc = 0;

#if 0
	if (event->header.type < PERF_RECORD_MAX) {
		cverb << vconvert << "PERF_RECORD type " << hex << event->header.type << endl;
	}
#endif

	switch (event->header.type) {
	case PERF_RECORD_SAMPLE:
		rc = __handle_sample_event(event, sample_type);
		break;

	case PERF_RECORD_MMAP:
		__handle_mmap_event(event);
		break;

	case PERF_RECORD_COMM:
		// The sample file layer is initialized once, on the first COMM record.
		if (!sfile_init_done) {
			operf_sfile_init();
			sfile_init_done = true;
		}
		__handle_comm_event(event);
		break;

	case PERF_RECORD_FORK:
		__handle_fork_event(event);
		break;

	case PERF_RECORD_THROTTLE:
		rc = __handle_throttle_event(event, sample_type);
		break;

	case PERF_RECORD_LOST:
		operf_stats[OPERF_RECORD_LOST_SAMPLE] += event->lost.lost;
		break;

	case PERF_RECORD_EXIT:
		break;

	default: {
		ostringstream message;
		if (event->header.type > PERF_RECORD_MAX) {
			// Bad header
			message << "Invalid event type " << hex << event->header.type << endl;
			message << "Sample data is probably corrupted." << endl;
			cerr << message.str();
			rc = -1;
		} else {
			// Known-range record type that this converter does not use.
			message << "Event type "<< hex << event->header.type
			        << " is ignored." << endl;
			cverb << vconvert << message.str();
		}
		break;
	}
	}
	return rc;
}
/*
 * Second pass over the samples that could not be resolved during first-time
 * processing.
 *
 * First, every known process is brought into a usable state: processes still
 * marked invalid are either connected to their parent (if forked) or marked
 * valid, and the appname of every process is forced valid. Then each deferred
 * event is handed to __handle_sample_event() again.
 *
 * @param sample_type     PERF_SAMPLE_* bitmask describing the record layout
 * @param print_progress  if true, print a '.' to stderr every 1,000,000 records
 *
 * All deferred events are freed and the list is emptied before returning, so
 * no dangling pointers remain in unresolved_events. Once a data error is
 * reported by __handle_sample_event(), the remaining events are released
 * without being processed.
 */
void OP_perf_utils::op_reprocess_unresolved_events(u64 sample_type, bool print_progress)
{
	int num_recs = 0;

	cverb << vconvert << "Reprocessing samples" << endl;

	map<pid_t, operf_process_info *>::iterator procs = process_map.begin();
	for (; procs != process_map.end(); procs++) {
		if (!procs->second->is_valid()) {
			if (procs->second->is_forked()) {
				procs->second->connect_forked_process_to_parent();
			} else {
				procs->second->set_valid();
			}
		}
		// Force the appname_valid to true so we don't drop any samples for this process.
		// The appname may not be accurate, but it's the best we can do now.
		procs->second->set_appname_valid();
	}

	list<event_t *>::const_iterator it = unresolved_events.begin();
	int data_error = 0;
	for (; it != unresolved_events.end(); it++) {
		event_t * evt = (*it);

		// The type check is just a sanity check, since all events in this
		// list are unresolved sample events.
		if (data_error >= 0 && evt->header.type == PERF_RECORD_SAMPLE) {
			data_error = __handle_sample_event(evt, sample_type);
			num_recs++;
			if ((num_recs % 1000000 == 0) && print_progress)
				cerr << ".";
		}
		// Every entry is a heap copy owned by this list; release it whether
		// or not it was processed.
		free(evt);
	}
	// All entries were freed above; drop the stale pointers.
	unresolved_events.clear();
}
void OP_perf_utils::op_release_resources(void)
{
map<pid_t, operf_process_info *>::iterator it = process_map.begin();
while (it != process_map.end())
delete it++->second;
process_map.clear();
multimap<string, struct operf_mmap *>::iterator images_it = all_images_map.begin();
while (images_it != all_images_map.end())
delete images_it++->second;
all_images_map.clear();
delete kernel_mmap;
operf_sfile_close_files();
operf_free_modules_list();
}
/*
 * Signal handler with the sa_sigaction signature. All arguments are ignored;
 * the handler only sets the global 'quit' flag.
 */
void OP_perf_utils::op_perfrecord_sigusr1_handler(
		int sig __attribute__((unused)),
		siginfo_t * siginfo __attribute__((unused)),
		void *u_context __attribute__((unused)))
{
	quit = true;
}
/*
 * Read up to 'sz' bytes from 'is' into 'buf'.
 *
 * Returns the number of bytes actually read (which may be short when end of
 * file is reached), or -1 if the read failed for a reason other than EOF.
 */
int OP_perf_utils::op_read_from_stream(ifstream & is, char * buf, streamsize sz)
{
	is.read(buf, sz);

	// Hitting EOF also sets failbit, so a failure only counts as an error
	// when EOF was not reached.
	const bool hard_failure = !is.eof() && is.fail();
	if (hard_failure) {
		cerr << "Internal error: Failed to read from input file." << endl;
		return -1;
	}
	return is.gcount();
}
static int __mmap_trace_file(struct mmap_info & info)
{
int mmap_prot = PROT_READ;
int mmap_flags = MAP_SHARED;
info.buf = (char *) mmap(NULL, mmap_size, mmap_prot,
mmap_flags, info.traceFD, info.offset);
if (info.buf == MAP_FAILED) {
ostringstream message;
message << "Error: mmap failed with errno:\n\t" << strerror(errno) << endl;
message << "\tmmap_size: 0x" << hex << mmap_size << "; offset: 0x" << info.offset << endl;
cerr << message.str();
return -1;
}
else {
ostringstream message;
message << hex << "mmap with the following parameters" << endl
<< "\tinfo.head: " << info.head << endl
<< "\tinfo.offset: " << info.offset << endl;
cverb << vconvert << message.str();
return 0;
}
}
int OP_perf_utils::op_mmap_trace_file(struct mmap_info & info, bool init)
{
u64 shift;
if (init) {
if (!mmap_size) {
if (MMAP_WINDOW_SZ > info.file_data_size) {
mmap_size = info.file_data_size;
} else {
mmap_size = MMAP_WINDOW_SZ;
}
}
info.offset = 0;
info.head = info.file_data_offset;
shift = pg_sz * (info.head / pg_sz);
info.offset += shift;
info.head -= shift;
}
return __mmap_trace_file(info);
}
/* Write all 'size' bytes starting at 'buf' to file descriptor 'output',
 * looping over short writes and retrying when write() is interrupted (EINTR).
 *
 * Returns the total number of bytes written.
 * Throws runtime_error (message includes the errno text) on any other
 * write() failure.
 */
int OP_perf_utils::op_write_output(int output, void *buf, size_t size)
{
	int written_total = 0;
	char * cursor = (char *)buf;
	size_t remaining = size;

	while (remaining) {
		int n = write(output, cursor, remaining);
		if (n >= 0) {
			// Partial or complete write: advance past what was accepted.
			remaining -= n;
			cursor += n;
			written_total += n;
			continue;
		}
		if (errno != EINTR) {
			string errmsg = "Internal error: Failed to write sample data to output fd. errno is ";
			errmsg += strerror(errno);
			throw runtime_error(errmsg);
		}
		// EINTR: simply try the same write again.
	}
	return written_total;
}
/* Synthesize PERF_RECORD_MMAP events for the executable mappings of an
 * already-running process.
 *
 * Reads /proc/<tgid>/maps and, for every mapping whose permission string has
 * 'x' in the third position, writes one mmap_event to 'output_fd' and adds
 * the number of bytes written to pr's running total.
 *
 * The image name is chosen as follows:
 *   - the pathname from its first '/' onward, if it contains one;
 *   - otherwise "[vdso]" if the pathname contains that;
 *   - otherwise "//anon" if the pathname contains no '[' at all
 *     (including the case of no pathname);
 *   - any other bracketed pseudo-mapping is skipped.
 *
 * @param pid        stored in the event's tid field
 * @param tgid       process whose maps file is read; stored in the pid field
 * @param output_fd  descriptor the events are written to
 * @param pr         record object whose byte total is updated
 *
 * If the maps file cannot be opened, the function logs on the 'record'
 * verbose channel and returns without writing anything.
 */
void OP_perf_utils::op_record_process_exec_mmaps(pid_t pid, pid_t tgid, int output_fd, operf_record * pr)
{
	char fname[PATH_MAX];
	FILE *fp;

	snprintf(fname, sizeof(fname), "/proc/%d/maps", tgid);
	fp = fopen(fname, "r");
	if (fp == NULL) {
		// Process must have exited already or invalid pid.
		cverb << vrecord << "couldn't open " << fname << endl;
		return;
	}

	char line_buffer[BUFSIZ];
	while (fgets(line_buffer, sizeof(line_buffer), fp) != NULL) {
		// pathname is as large as the line itself, so the unbounded %s
		// below cannot overflow it no matter what the line contains.
		char pathname[BUFSIZ];
		char perms[5];
		unsigned long long start_addr, end_addr;
		const char * anon_mem = "//anon";

		pathname[0] = '\0';
		// The offset, device and inode fields are not used, so they are
		// parsed with assignment suppression. Anonymous mappings have no
		// pathname, in which case only three items are assigned.
		int fields = sscanf(line_buffer, "%llx-%llx %4s %*x %*s %*s %s",
		                    &start_addr, &end_addr, perms, pathname);
		if (fields < 3 || strlen(perms) < 3)
			continue;	// malformed line; nothing trustworthy was parsed
		if (perms[2] != 'x')
			continue;	// only executable mappings are of interest

		const char * imagename = strchr(pathname, '/');
		if (imagename == NULL)
			imagename = strstr(pathname, "[vdso]");
		if ((imagename == NULL) && (strchr(pathname, '[') == NULL))
			imagename = anon_mem;
		if (imagename == NULL)
			continue;

		struct mmap_event mmap;
		memset(&mmap, 0, sizeof(mmap));
		size_t size = strlen(imagename) + 1;
		if (size > sizeof(mmap.filename)) {
			cverb << vrecord << "Skipping mapping with overlong name in "
			      << fname << endl;
			continue;
		}
		memcpy(mmap.filename, imagename, size);

		mmap.pgoff = 0;
		mmap.header.type = PERF_RECORD_MMAP;
		mmap.header.misc = PERF_RECORD_MISC_USER;
		mmap.start = start_addr;
		mmap.len = end_addr - mmap.start;
		mmap.pid = tgid;
		mmap.tid = pid;
		// Only the used part of the filename array (rounded up to a 64-bit
		// boundary) is emitted as part of the record.
		size = align_64bit(size);
		mmap.header.size = (sizeof(mmap) -
				(sizeof(mmap.filename) - size));

		int num = OP_perf_utils::op_write_output(output_fd, &mmap, mmap.header.size);
		if (cverb << vrecord)
			cout << "Created MMAP event for " << imagename << endl;
		pr->add_to_total(num);
	}

	fclose(fp);
}
/* Collect the command name and thread IDs of one process from /proc and
 * register them with 'pr' via add_process().
 *
 * The "Name:" and "Tgid:" fields are read from /proc/<pid>/status. If 'pid'
 * is not the thread-group leader (tgid != pid), a single comm record is added
 * for it. Otherwise one comm record is added for every numeric entry under
 * /proc/<pid>/task.
 *
 * @param sys_wide  true when profiling system-wide; a missing status file is
 *                  then silently tolerated
 * @param pid       process (or thread) ID to look up
 * @param pr        record object receiving the comm events
 *
 * @return 0 on success (or when sys_wide and the status file is missing),
 *         OP_PERF_HANDLED_ERROR when !sys_wide and the status file cannot be
 *         opened, -1 when the status file lacks the needed fields or the task
 *         directory cannot be opened.
 */
static int _get_one_process_info(bool sys_wide, pid_t pid, operf_record * pr)
{
	// All locals are declared up front because of the 'goto out' paths.
	struct comm_event comm;
	char fname[PATH_MAX];
	char buff[BUFSIZ];
	FILE *fp;
	pid_t tgid = 0;
	size_t size = 0;
	DIR *tids;
	struct dirent *entry;
	int ret = 0;

	snprintf(fname, sizeof(fname), "/proc/%d/status", pid);
	fp = fopen(fname, "r");
	if (fp == NULL) {
		/* Process must have finished or invalid PID passed into us.
		 * If we're doing system-wide profiling, this case can naturally
		 * occur, and it's not an error. But if profiling on a single
		 * application, we can't continue after this, so we'll bail out now.
		 */
		if (!sys_wide) {
			cerr << "Unable to find process information for process " << pid << "." << endl;
			cverb << vrecord << "couldn't open " << fname << endl;
			return OP_PERF_HANDLED_ERROR;
		} else {
			return 0;
		}
	}

	memset(&comm, 0, sizeof(comm));
	while (!comm.comm[0] || !comm.pid) {
		if (fgets(buff, sizeof(buff), fp) == NULL) {
			ret = -1;
			cverb << vrecord << "Did not find Name or PID field in status file." << endl;
			goto out;
		}
		if (!strncmp(buff, "Name:", 5)) {
			char *name = buff + 5;
			while (*name && isspace((unsigned char)*name))
				++name;
			size_t len = strlen(name);
			// Drop the trailing newline only if there really is one; an
			// empty field must not underflow the length.
			if (len > 0 && name[len - 1] == '\n')
				--len;
			// Always leave room for the terminating NUL (comm was zeroed
			// above), so the record size computed below can never exceed
			// the size of the comm array.
			if (len > sizeof(comm.comm) - 1)
				len = sizeof(comm.comm) - 1;
			memcpy(comm.comm, name, len);
			size = len + 1;		// account for the NUL terminator
		} else if (!strncmp(buff, "Tgid:", 5)) {
			char *tgids = buff + 5;
			while (*tgids && isspace((unsigned char)*tgids))
				++tgids;
			tgid = comm.pid = atoi(tgids);
		}
	}

	comm.header.type = PERF_RECORD_COMM;
	size = align_64bit(size);
	comm.header.size = sizeof(comm) - (sizeof(comm.comm) - size);
	if (tgid != pid) {
		// passed pid must have been a secondary thread, and we
		// don't go looking at the /proc/<pid>/task of such processes.
		comm.tid = pid;
		pr->add_process(comm);
		goto out;
	}

	snprintf(fname, sizeof(fname), "/proc/%d/task", pid);
	tids = opendir(fname);
	if (tids == NULL) {
		// process must have exited
		ret = -1;
		cverb << vrecord << "Process " << pid << " apparently exited while "
		      << "process info was being collected"<< endl;
		goto out;
	}

	// readdir() is used instead of the deprecated readdir_r(); this DIR
	// stream is local to this function.
	while ((entry = readdir(tids)) != NULL) {
		char *end;
		pid = strtol(entry->d_name, &end, 10);
		// Skip "." and ".." and anything else that is not purely numeric.
		if (end == entry->d_name || *end)
			continue;
		comm.tid = pid;
		pr->add_process(comm);
	}
	closedir(tids);

out:
	fclose(fp);
	if (ret) {
		cverb << vrecord << "couldn't get app name and tgid for pid "
		      << dec << pid << " from /proc fs." << endl;
	}
	return ret;
}
/* Obtain process information for an active process (where the user has
 * passed in a process ID via the --pid option) or all active processes
 * (where system_wide==true).
 *
 * In the system-wide case every entry of /proc whose name is a positive
 * decimal number is treated as a PID and handed to _get_one_process_info();
 * the scan stops at the first negative return value.
 *
 * @return the result of the (last) _get_one_process_info() call, or -1 if
 *         /proc cannot be opened.
 */
int OP_perf_utils::op_get_process_info(bool system_wide, pid_t pid, operf_record * pr)
{
	int ret = 0;
	if (cverb << vrecord)
		cout << "op_get_process_info" << endl;

	if (!system_wide)
		return _get_one_process_info(system_wide, pid, pr);

	DIR *pids = opendir("/proc");
	if (pids == NULL) {
		cerr << "Unable to open /proc." << endl;
		return -1;
	}

	// readdir() is used instead of the deprecated readdir_r(); this DIR
	// stream is local to this function.
	struct dirent *entry;
	while ((entry = readdir(pids)) != NULL) {
		char *end;
		// strtol() only sets errno on failure, so it has to be cleared
		// first for the check below to be meaningful.
		errno = 0;
		long value = strtol(entry->d_name, &end, 10);
		pid = (pid_t)value;
		if (errno != 0 || end == entry->d_name || *end != '\0'
		    || value <= 0 || (long)pid != value) {
			cverb << vmisc << "/proc entry " << entry->d_name << " is not a PID" << endl;
			continue;
		}
		if ((ret = _get_one_process_info(system_wide, pid, pr)) < 0)
			break;
	}
	closedir(pids);
	return ret;
}
/*
 * Write one kernel-space PERF_RECORD_MMAP event to 'output_fd' for every
 * module listed in /proc/modules, adding the bytes written to pr's total.
 *
 * each line is in the format:
 *
 * module_name 16480 1 dependencies Live 0xe091e000
 *
 * without any blank space in each field
 *
 * Lines that do not yield all six fields are reported on stderr and skipped.
 * If /proc/modules cannot be opened, an error is printed and nothing is
 * written.
 */
static void _record_module_info(int output_fd, operf_record * pr)
{
	const char * fname = "/proc/modules";
	FILE *fp;
	char * line;
	// Unsigned to match the %u conversion used below.
	unsigned int module_size;
	char ref_count[32+1];
	int ret;
	char module_name[256+1];
	char live_info[32+1];
	char dependencies[4096+1];
	unsigned long long start_address;

	fp = fopen(fname, "r");
	if (fp == NULL) {
		cerr << "Error opening /proc/modules. Unable to process module samples" << endl;
		cerr << strerror(errno) << endl;
		return;
	}

	while (1) {
		struct mmap_event mmap;
		size_t size;

		line = op_get_line(fp);
		if (!line)
			break;
		if (line[0] == '\0') {
			free(line);
			continue;
		}

		// Field widths match the sizes of the receiving buffers.
		ret = sscanf(line, "%256s %u %32s %4096s %32s %llx",
			     module_name, &module_size, ref_count,
			     dependencies, live_info, &start_address);
		if (ret != 6) {
			cerr << "op_record_kernel_info: Bad /proc/modules entry: \n\t" << line << endl;
			free(line);
			continue;
		}
		// Everything needed has been copied out of the line, so release it
		// now rather than after the write (which may throw).
		free(line);

		memset(&mmap, 0, sizeof(mmap));
		mmap.pgoff = 0;
		mmap.header.type = PERF_RECORD_MMAP;
		mmap.header.misc = PERF_RECORD_MISC_KERNEL;
		size = strlen(module_name) + 1;
		strncpy(mmap.filename, module_name, size);
		size = align_64bit(size);
		mmap.start = start_address;
		mmap.len = module_size;
		mmap.pid = 0;
		mmap.tid = 0;
		// Emit only the used, 64-bit aligned part of the filename array.
		mmap.header.size = (sizeof(mmap) -
				(sizeof(mmap.filename) - size));

		int num = OP_perf_utils::op_write_output(output_fd, &mmap, mmap.header.size);
		if (cverb << vrecord)
			cout << "Created MMAP event for " << module_name << ". Size: "
			     << module_size << "; start addr: " << start_address << endl;
		pr->add_to_total(num);
	}

	fclose(fp);
}
/* Write the kernel-space PERF_RECORD_MMAP event describing the kernel image
 * to 'output_fd', then do the same for all loaded modules via
 * _record_module_info().
 *
 * @param vmlinux_file  path of the vmlinux image; when empty, the event is
 *                      named "no-vmlinux" with start and length of zero
 * @param start_addr    start address recorded for the kernel image
 * @param end_addr      end address; the recorded length is end - start
 * @param output_fd     descriptor the events are written to
 * @param pr            record object whose byte total is updated
 *
 * A vmlinux path too long for the event's filename array is truncated so
 * that the array is never overrun.
 */
void OP_perf_utils::op_record_kernel_info(string vmlinux_file, u64 start_addr, u64 end_addr,
                                          int output_fd, operf_record * pr)
{
	struct mmap_event mmap;
	size_t size;

	memset(&mmap, 0, sizeof(mmap));
	mmap.pgoff = 0;
	mmap.header.type = PERF_RECORD_MMAP;
	mmap.header.misc = PERF_RECORD_MISC_KERNEL;

	const char * image_name;
	if (vmlinux_file.empty()) {
		image_name = "no-vmlinux";
		mmap.start = 0ULL;
		mmap.len = 0ULL;
	} else {
		image_name = vmlinux_file.c_str();
		mmap.start = start_addr;
		mmap.len = end_addr - mmap.start;
	}

	// size counts the NUL terminator. Clamp it to the destination so neither
	// the copy nor the header size computation can go out of range; the
	// memset above already provides the terminating NUL.
	size = strlen(image_name) + 1;
	if (size > sizeof(mmap.filename))
		size = sizeof(mmap.filename);
	memcpy(mmap.filename, image_name, size - 1);

	size = align_64bit(size);
	mmap.pid = 0;
	mmap.tid = 0;
	// Emit only the used, 64-bit aligned part of the filename array.
	mmap.header.size = (sizeof(mmap) -
			(sizeof(mmap.filename) - size));

	int num = op_write_output(output_fd, &mmap, mmap.header.size);
	if (cverb << vrecord) {
		ostringstream message;
		message << "Created MMAP event of size " << mmap.header.size << " for " <<mmap.filename << ". length: "
		        << hex << mmap.len << "; start addr: " << mmap.start << endl;
		cout << message.str();
	}
	pr->add_to_total(num);
	_record_module_info(output_fd, pr);
}
// Copy the data that has appeared in one mmap'ed event buffer since the last
// call to pr's output fd, and record the new consumed position.
//
// md->base is read as a struct perf_event_mmap_page (the control page); the
// event data area starts 'pagesize' bytes after md->base. md->prev holds the
// position consumed by earlier calls and is updated here. Byte counts
// returned by op_write_output() are added to pr's total.
//
// Throws runtime_error if the head position read from the control page is
// behind md->prev.
void OP_perf_utils::op_get_kernel_event_data(struct mmap_data *md, operf_record * pr)
{
struct perf_event_mmap_page *pc = (struct perf_event_mmap_page *)md->base;
int out_fd = pr->out_fd();
uint64_t head = pc->data_head;
// Comment in perf_event.h says "User-space reading the @data_head value should issue
// an rmb(), on SMP capable platforms, after reading this value."
rmb();
// Position up to which previous calls have already written data out.
uint64_t old = md->prev;
// Start of the data area: one page past the control page.
unsigned char *data = ((unsigned char *)md->base) + pagesize;
uint64_t size;
void *buf;
int64_t diff;
// Nothing new since the last call.
if (old == head)
return;
// Signed difference: a negative value means head is behind old.
diff = head - old;
if (diff < 0) {
throw runtime_error("ERROR: event buffer wrapped, which should NEVER happen.");
}
// Always true at this point (the equal case returned above); counts this
// call as one that found data.
if (old != head)
sample_reads++;
size = head - old;
// Positions are reduced to buffer offsets with '& md->mask'. If the masked
// start plus the length does not land on the masked head, the new data is
// not one contiguous span: first write the piece that runs up to the end of
// the data area (md->mask + 1 bytes) ...
if ((old & md->mask) + size != (head & md->mask)) {
buf = &data[old & md->mask];
size = md->mask + 1 - (old & md->mask);
old += size;
pr->add_to_total(op_write_output(out_fd, buf, size));
}
// ... then write the remaining (or, in the contiguous case, the entire) span
// up to head.
buf = &data[old & md->mask];
size = head - old;
old += size;
pr->add_to_total(op_write_output(out_fd, buf, size));
// Remember how far we got, both locally and in the control page's data_tail.
md->prev = old;
pc->data_tail = old;
}
/* Advance through an open directory stream (expected by the paths built here
 * to be /sys/devices/system/cpu) and return the number of the next CPU that
 * is online.
 *
 * Only directory entries named "cpu<N>" are considered. For each one,
 * /sys/devices/system/cpu/cpu<N>/online is read; a first character of '0'
 * means the CPU is offline and the search continues. A CPU that has no
 * 'online' file at all is treated as online, since the kernel omits that
 * file for CPUs that cannot be taken offline.
 *
 * @param dir    directory stream to continue reading from
 * @param entry  used only as local scratch; the caller's pointer is not
 *               updated
 *
 * @return the CPU number, or -1 when the directory is exhausted or the
 *         'online' file exists but cannot be opened (a diagnostic is printed
 *         in the latter case).
 */
int OP_perf_utils::op_get_next_online_cpu(DIR * dir, struct dirent *entry)
{
// ASCII '0': content of the 'online' file for an offline CPU.
#define OFFLINE 0x30
	unsigned int cpu_num;
	// 27 bytes of prefix + up to 10 digits + "/online" + NUL fits easily.
	char cpu_online_pathname[64];
	int res;
	FILE * online;

	for (;;) {
		entry = readdir(dir);
		if (!entry)
			return -1;
		if (entry->d_type != DT_DIR)
			continue;

		// Entries such as "cpufreq" or "cpuidle" fail the %u conversion.
		res = sscanf(entry->d_name, "cpu%u", &cpu_num);
		if (res <= 0)
			continue;

		snprintf(cpu_online_pathname, sizeof(cpu_online_pathname),
			 "/sys/devices/system/cpu/cpu%u/online", cpu_num);
		errno = 0;
		online = fopen(cpu_online_pathname, "r");
		if (online == NULL) {
			// Capture errno before any stream output can disturb it.
			int open_errno = errno;
			if (open_errno == ENOENT)
				return cpu_num;	// not hot-pluggable => always online
			cerr << "Unable to open " << cpu_online_pathname << endl;
			if (open_errno)
				cerr << strerror(open_errno) << endl;
			return -1;
		}
		res = fgetc(online);
		fclose(online);
		if (res != OFFLINE)
			return cpu_num;
	}
}