/**
* @file pe_profiling/operf_counter.cpp
* C++ class implementation that abstracts the user-to-kernel interface
* for using Linux Performance Events Subsystem.
*
* @remark Copyright 2011 OProfile authors
* @remark Read the file COPYING
*
* Created on: Dec 7, 2011
* @author Maynard Johnson
* (C) Copyright IBM Corp. 2011
*
* Modified by Maynard Johnson <maynardj@us.ibm.com>
* (C) Copyright IBM Corporation 2012
*
*/
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <signal.h>
#include <errno.h>
#include <string.h>
#include <iostream>
#include <sstream>
#include <stdlib.h>
#include "op_events.h"
#include "operf_counter.h"
#include "op_abi.h"
#include "cverb.h"
#include "operf_process_info.h"
#include "op_libiberty.h"
#include "operf_stats.h"
using namespace std;
using namespace OP_perf_utils;
volatile bool quit;
int sample_reads;
int num_mmap_pages;
unsigned int pagesize;
verbose vrecord("record");
verbose vconvert("convert");
extern bool first_time_processing;
extern bool throttled;
extern size_t mmap_size;
extern size_t pg_sz;
extern bool use_cpu_minus_one;
namespace {
vector<string> event_names;
static const char *__op_magic = "OPFILE";
#define OP_MAGIC (*(u64 *)__op_magic)
/* This function for reading an event from the sample data pipe must
* be robust enough to handle the situation where the operf_record process
* writes an event record to the pipe in multiple chunks.
*/
#define OP_PIPE_READ_OK 0
#define OP_PIPE_CLOSED -1
static int _get_perf_event_from_pipe(event_t * event, int sample_data_fd)
{
static size_t pe_header_size = sizeof(perf_event_header);
size_t read_size = pe_header_size;
int rc = OP_PIPE_READ_OK;
char * evt = (char *)event;
ssize_t num_read;
perf_event_header * header = (perf_event_header *)event;
memset(header, '\0', pe_header_size);
/* A signal handler was set up for the operf_read process to handle interrupts
* (i.e., from ctrl-C), so the read syscalls below may get interrupted. But the
* operf_read process should ignore the interrupt and continue processing
* until there's no more data to read or until the parent operf process
* forces us to stop. So we must try the read operation again if it was
* interrupted.
*/
again:
errno = 0;
if ((num_read = read(sample_data_fd, evt, read_size)) < 0) {
cverb << vdebug << "Read 1 of sample data pipe returned with " << strerror(errno) << endl;
if (errno == EINTR) {
goto again;
} else {
rc = OP_PIPE_CLOSED;
goto out;
}
} else if (num_read == 0) {
// Implies pipe has been closed on the write end, so return -1 to quit reading
rc = OP_PIPE_CLOSED;
goto out;
} else if (num_read != read_size) {
/* Partial read of the header: advance the byte cursor, not the typed
 * perf_event_header pointer, which would skip a full struct per unit.
 */
evt += num_read;
read_size -= num_read;
goto again;
}
read_size = header->size - pe_header_size;
if (read_size == 0) {
/* This is technically a valid record -- it's just empty. I'm not
 * sure if this can happen (i.e., if the kernel ever creates empty
 * records), but we'll handle it just in case.
 */
evt = (char *)event;
read_size = pe_header_size;
goto again;
}
if (!header->size || (header->size < pe_header_size))
/* Bogus header size detected. In this case, we don't set rc to -1,
* because the caller will catch this error when it calls is_header_valid().
* I've seen such bogus stuff occur when profiling lots of processes at
* a very high sampling frequency. This issue is still being investigated,
* so for now, we'll just do our best to detect and handle gracefully.
*/
goto out;
/* Read the body of the record just past the header. */
evt = (char *)event + pe_header_size;
again2:
if ((num_read = read(sample_data_fd, evt, read_size)) < 0) {
cverb << vdebug << "Read 2 of sample data pipe returned with " << strerror(errno) << endl;
if (errno == EINTR) {
goto again2;
} else {
rc = OP_PIPE_CLOSED;
if (errno == EFAULT)
cerr << "Size of event record: " << header->size << endl;
goto out;
}
} else if (num_read == 0) {
// Implies pipe has been closed on the write end, so return -1 to quit reading
rc = OP_PIPE_CLOSED;
goto out;
} else if (num_read != read_size) {
evt += num_read;
read_size -= num_read;
goto again2;
}
out:
return rc;
}
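/* Illustrative sketch only (not compiled here): the two goto loops above are
 * each equivalent to the generic "read exactly n bytes from a pipe" pattern
 * below, where fd, buf and n are placeholders:
 *
 *   char * p = buf;
 *   size_t remaining = n;
 *   ssize_t nr;
 *   while (remaining &&
 *          ((nr = read(fd, p, remaining)) > 0 || (nr < 0 && errno == EINTR))) {
 *       if (nr > 0) {
 *           p += nr;
 *           remaining -= nr;
 *       }
 *   }
 *   // success iff remaining == 0; nr == 0 means the write end was closed
 *
 * The function above applies this twice: once for sizeof(perf_event_header)
 * bytes, then for header.size - sizeof(perf_event_header) bytes of payload.
 */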
static event_t * _get_perf_event_from_file(struct mmap_info & info)
{
uint32_t size = 0;
static int num_remaps = 0;
event_t * event;
size_t pe_header_size = sizeof(struct perf_event_header);
try_again:
event = NULL;
if (info.offset + info.head + pe_header_size > info.file_data_size)
goto out;
if (info.head + pe_header_size <= mmap_size)
event = (event_t *)(info.buf + info.head);
if (unlikely(!event || (info.head + event->header.size > mmap_size))) {
int ret;
u64 shift = pg_sz * (info.head / pg_sz);
cverb << vdebug << "Remapping perf data file: " << dec << ++num_remaps << endl;
ret = munmap(info.buf, mmap_size);
if (ret) {
string errmsg = "Internal error: munmap of perf data file failed with errno: ";
errmsg += strerror(errno);
throw runtime_error(errmsg);
}
info.offset += shift;
info.head -= shift;
ret = op_mmap_trace_file(info, false);
if (ret) {
string errmsg = "Internal error: mmap of perf data file failed with errno: ";
errmsg += strerror(errno);
throw runtime_error(errmsg);
}
goto try_again;
}
size = event->header.size;
info.head += size;
out:
if (unlikely(!event)) {
cverb << vdebug << "No more event records in file. info.offset: " << dec << info.offset
<< "; info.head: " << info.head << "; info.file_data_size: " << info.file_data_size
<< endl << "; mmap_size: " << mmap_size << "; current record size: " << size << endl;
}
return event;
}
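/* Worked example of the remap arithmetic above (numbers are illustrative):
 * with pg_sz == 4096 and info.head == 1050000, the next record extends past
 * the current mmap window, so
 *
 *   shift       = pg_sz * (info.head / pg_sz) = 4096 * 256 = 1048576
 *   info.offset += 1048576;   // new window starts on a page boundary
 *   info.head    = 1050000 - 1048576 = 1424;
 *
 * and after op_mmap_trace_file() remaps, the pending record begins 1424
 * bytes into the new window, where it can be read in full.
 */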
} // end anonymous namespace
operf_counter::operf_counter(operf_event_t & evt, bool enable_on_exec, bool do_cg,
bool separate_cpu, bool inherit, int event_number)
{
memset(&attr, 0, sizeof(attr));
attr.size = sizeof(attr);
attr.sample_type = OP_BASIC_SAMPLE_FORMAT;
if (do_cg)
attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
if (separate_cpu)
attr.sample_type |= PERF_SAMPLE_CPU;
attr.type = PERF_TYPE_RAW;
#if ((defined(__i386__) || defined(__x86_64__)) && (HAVE_PERF_PRECISE_IP))
if (evt.evt_code & EXTRA_PEBS) {
attr.precise_ip = 2;
evt.evt_code ^= EXTRA_PEBS;
}
#endif
attr.config = evt.evt_code;
attr.sample_period = evt.count;
attr.inherit = inherit ? 1 : 0;
attr.enable_on_exec = enable_on_exec ? 1 : 0;
attr.disabled = 1;
attr.exclude_idle = 0;
attr.exclude_kernel = evt.no_kernel;
attr.exclude_hv = evt.no_hv;
attr.read_format = PERF_FORMAT_ID;
event_name = evt.name;
fd = id = -1;
evt_num = event_number;
}
operf_counter::~operf_counter() {
}
int operf_counter::perf_event_open(pid_t pid, int cpu, operf_record * rec)
{
struct {
u64 count;
u64 id;
} read_data;
if (evt_num == 0) {
attr.mmap = 1;
attr.comm = 1;
}
fd = op_perf_event_open(&attr, pid, cpu, -1, 0);
if (fd < 0) {
int ret = -1;
cverb << vrecord << "perf_event_open failed: " << strerror(errno) << endl;
if (errno == EBUSY) {
cerr << "The performance monitoring hardware reports EBUSY. Is another profiling tool in use?" << endl
<< "On some architectures, tools such as oprofile and perf being used in system-wide "
<< "mode can cause this problem." << endl;
ret = OP_PERF_HANDLED_ERROR;
} else if (errno == ESRCH) {
cerr << "!!!! No samples collected !!!" << endl;
cerr << "The target program/command ended before profiling was started." << endl;
ret = OP_PERF_HANDLED_ERROR;
} else {
cerr << "perf_event_open failed with " << strerror(errno) << endl;
}
return ret;
}
if (read(fd, &read_data, sizeof(read_data)) == -1) {
perror("Error reading perf_event fd");
return -1;
}
rec->register_perf_event_id(evt_num, read_data.id, attr);
cverb << vrecord << "perf_event_open returning fd " << fd << endl;
return fd;
}
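/* A minimal sketch (not compiled here) of what the read() above returns:
 * with attr.read_format set to PERF_FORMAT_ID, the kernel reports the
 * counter value followed by its unique id, which is what the read_data
 * struct in perf_event_open() is laid out to receive:
 *
 *   struct { u64 value; u64 id; } rd;
 *   if (read(fd, &rd, sizeof(rd)) == sizeof(rd))
 *       register_id(rd.id);   // placeholder; operf passes the id to
 *                             // operf_record::register_perf_event_id()
 */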
operf_record::~operf_record()
{
cverb << vrecord << "operf_record::~operf_record()" << endl;
opHeader.data_size = total_bytes_recorded;
// If recording to a file, we re-write the op_header info
// in order to update the data_size field.
if (total_bytes_recorded && write_to_file)
write_op_header_info();
if (poll_data)
delete[] poll_data;
for (int i = 0; i < samples_array.size(); i++) {
struct mmap_data *md = &samples_array[i];
munmap(md->base, (num_mmap_pages + 1) * pagesize);
}
samples_array.clear();
evts.clear();
perfCounters.clear();
/* Close output_fd last. If sample data was being written to a pipe, we want
* to give the pipe reader (i.e., operf_read::convertPerfData) as much time
* as possible in order to drain the pipe of any remaining data.
*/
close(output_fd);
}
operf_record::operf_record(int out_fd, bool sys_wide, pid_t the_pid, bool pid_running,
vector<operf_event_t> & events, vmlinux_info_t vi, bool do_cg,
bool separate_by_cpu, bool out_fd_is_file)
{
int flags = O_CREAT|O_RDWR|O_TRUNC;
struct sigaction sa;
sigset_t ss;
vmlinux_file = vi.image_name;
kernel_start = vi.start;
kernel_end = vi.end;
pid_to_profile = the_pid;
pid_started = pid_running;
system_wide = sys_wide;
callgraph = do_cg;
separate_cpu = separate_by_cpu;
total_bytes_recorded = 0;
poll_count = 0;
evts = events;
valid = false;
poll_data = NULL;
output_fd = out_fd;
write_to_file = out_fd_is_file;
opHeader.data_size = 0;
num_cpus = -1;
if (system_wide && (pid_to_profile != -1 || pid_started))
return; // object is not valid
cverb << vrecord << "operf_record ctor using output fd " << output_fd << endl;
memset(&sa, 0, sizeof(struct sigaction));
sa.sa_sigaction = op_perfrecord_sigusr1_handler;
sigemptyset(&sa.sa_mask);
sigemptyset(&ss);
sigaddset(&ss, SIGUSR1);
sigprocmask(SIG_UNBLOCK, &ss, NULL);
sa.sa_mask = ss;
sa.sa_flags = SA_NOCLDSTOP | SA_SIGINFO;
cverb << vrecord << "calling sigaction" << endl;
if (sigaction(SIGUSR1, &sa, NULL) == -1) {
cverb << vrecord << "operf_record ctor: sigaction failed; errno is: "
<< strerror(errno) << endl;
_exit(EXIT_FAILURE);
}
cverb << vrecord << "calling setup" << endl;
setup();
}
int operf_record::_write_header_to_file(void)
{
struct OP_file_header f_header;
struct op_file_attr f_attr;
int total = 0;
if (lseek(output_fd, sizeof(f_header), SEEK_SET) == (off_t)-1)
goto err_out;
for (unsigned i = 0; i < evts.size(); i++) {
opHeader.h_attrs[i].id_offset = lseek(output_fd, 0, SEEK_CUR);
if (opHeader.h_attrs[i].id_offset == (off_t)-1)
goto err_out;
total += op_write_output(output_fd, &opHeader.h_attrs[i].ids[0],
opHeader.h_attrs[i].ids.size() * sizeof(u64));
}
opHeader.attr_offset = lseek(output_fd, 0, SEEK_CUR);
if (opHeader.attr_offset == (off_t)-1)
goto err_out;
for (unsigned i = 0; i < evts.size(); i++) {
struct op_header_evt_info attr = opHeader.h_attrs[i];
f_attr.attr = attr.attr;
f_attr.ids.offset = attr.id_offset;
f_attr.ids.size = attr.ids.size() * sizeof(u64);
total += op_write_output(output_fd, &f_attr, sizeof(f_attr));
}
opHeader.data_offset = lseek(output_fd, 0, SEEK_CUR);
if (opHeader.data_offset == (off_t)-1)
goto err_out;
f_header.magic = OP_MAGIC;
f_header.size = sizeof(f_header);
f_header.attr_size = sizeof(f_attr);
f_header.attrs.offset = opHeader.attr_offset;
f_header.attrs.size = evts.size() * sizeof(f_attr);
f_header.data.offset = opHeader.data_offset;
f_header.data.size = opHeader.data_size;
if (lseek(output_fd, 0, SEEK_SET) == (off_t)-1)
goto err_out;
total += op_write_output(output_fd, &f_header, sizeof(f_header));
if (lseek(output_fd, opHeader.data_offset + opHeader.data_size, SEEK_SET) == (off_t)-1)
goto err_out;
return total;
err_out:
string errmsg = "Internal error doing lseek: ";
errmsg += strerror(errno);
throw runtime_error(errmsg);
}
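/* Rough layout of the file produced above (offsets are illustrative; the
 * OP_file_header at offset 0 is written last, once the real attr and data
 * offsets are known):
 *
 *   offset 0:        struct OP_file_header (magic, sizes, section offsets)
 *   id sections:     one run of u64 perf ids per event (id_offset[i])
 *   attr_offset:     evts.size() x struct op_file_attr (attr + ids location)
 *   data_offset:     sample data appended later by the record loop
 */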
int operf_record::_write_header_to_pipe(void)
{
struct OP_file_header f_header;
struct op_file_attr f_attr;
int total;
f_header.magic = OP_MAGIC;
f_header.size = sizeof(f_header);
f_header.attr_size = sizeof(f_attr);
f_header.attrs.size = evts.size() * sizeof(f_attr);
f_header.data.size = 0;
total = op_write_output(output_fd, &f_header, sizeof(f_header));
for (unsigned i = 0; i < evts.size(); i++) {
struct op_header_evt_info attr = opHeader.h_attrs[i];
f_attr.attr = attr.attr;
f_attr.ids.size = attr.ids.size() * sizeof(u64);
total += op_write_output(output_fd, &f_attr, sizeof(f_attr));
}
for (unsigned i = 0; i < evts.size(); i++) {
total += op_write_output(output_fd, &opHeader.h_attrs[i].ids[0],
opHeader.h_attrs[i].ids.size() * sizeof(u64));
}
return total;
}
void operf_record::register_perf_event_id(unsigned event, u64 id, perf_event_attr attr)
{
// It's overkill to blindly do this assignment below every time, since this function
// is invoked once for each event for each cpu; but it's not worth the bother of trying
// to avoid it.
opHeader.h_attrs[event].attr = attr;
ostringstream message;
message << "Perf header: id = " << hex << (unsigned long long)id << " for event num "
<< event << ", code " << attr.config << endl;
cverb << vrecord << message.str();
opHeader.h_attrs[event].ids.push_back(id);
}
void operf_record::write_op_header_info()
{
if (write_to_file)
add_to_total(_write_header_to_file());
else
add_to_total(_write_header_to_pipe());
}
int operf_record::_prepare_to_record_one_fd(int idx, int fd)
{
struct mmap_data md;
md.prev = 0;
md.mask = num_mmap_pages * pagesize - 1;
if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
perror("fcntl failed");
return -1;
}
poll_data[idx].fd = fd;
poll_data[idx].events = POLLIN;
poll_count++;
md.base = mmap(NULL, (num_mmap_pages + 1) * pagesize,
PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (md.base == MAP_FAILED) {
if (errno == EPERM) {
cerr << "Failed to mmap kernel profile data." << endl;
cerr << "This issue may be caused by a non-root user running multiple operf" << endl;
cerr << "sessions simultaneously. Try running as root or increasing the value of" << endl;
cerr << "/proc/sys/kernel/perf_event_mlock_kb to resolve the problem." << endl << endl;
return OP_PERF_HANDLED_ERROR;
} else {
perror("failed to mmap");
}
return -1;
}
samples_array.push_back(md);
return 0;
}
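/* Illustrative numbers for the mmap above: with pagesize == 4096 and
 * num_mmap_pages == 128 (the non-process-group case), the mapping covers
 * 129 pages -- one metadata page plus a power-of-two data area -- and
 *
 *   md.mask = 128 * 4096 - 1 = 0x7ffff
 *
 * so the reader side can wrap ring-buffer positions with an expression of
 * the form (pos & md.mask) when copying sample data out of the buffer.
 */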
int operf_record::prepareToRecord(void)
{
int op_ctr_idx = 0;
int rc = 0;
errno = 0;
if (pid_started && (procs.size() > 1)) {
/* Implies we're profiling a thread group, where we call perf_event_open
* on each thread (process) in the group, passing cpu=-1. So we'll do
* one mmap per thread (by way of the _prepare_to_record_one_fd function).
* If more than one event has been specified to profile on, we just do an
* ioctl PERF_EVENT_IOC_SET_OUTPUT to tie that perf_event fd with the fd
* of the first event of the thread.
*/
// Sanity check
if ((procs.size() * evts.size()) != perfCounters.size()) {
cerr << "Internal error: Number of fds[] (" << perfCounters.size()
<< ") != number of processes x number of events ("
<< procs.size() << " x " << evts.size() << ")." << endl;
return -1;
}
for (unsigned int proc_idx = 0; proc_idx < procs.size(); proc_idx++) {
int fd_for_set_output = perfCounters[op_ctr_idx].get_fd();
for (unsigned event = 0; event < evts.size(); event++) {
int fd = perfCounters[op_ctr_idx].get_fd();
if (event == 0) {
rc = _prepare_to_record_one_fd(proc_idx, fd);
} else {
if ((rc = ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT,
fd_for_set_output)) < 0)
perror("prepareToRecord: ioctl #1 failed");
}
if (rc < 0)
return rc;
if ((rc = ioctl(fd, PERF_EVENT_IOC_ENABLE)) < 0) {
perror("prepareToRecord: ioctl #2 failed");
return rc;
}
op_ctr_idx++;
}
}
} else {
/* We're either doing a system-wide profile or a profile of a single process.
* We'll do one mmap per cpu. If more than one event has been specified
* to profile on, we just do an ioctl PERF_EVENT_IOC_SET_OUTPUT to tie
* that perf_event fd with the fd of the first event of the cpu.
*/
if ((num_cpus * evts.size()) != perfCounters.size()) {
cerr << "Internal error: Number of fds[] (" << perfCounters.size()
<< ") != number of cpus x number of events ("
<< num_cpus << " x " << evts.size() << ")." << endl;
return -1;
}
for (unsigned int cpu = 0; cpu < num_cpus; cpu++) {
int fd_for_set_output = perfCounters[op_ctr_idx].get_fd();
for (unsigned event = 0; event < evts.size(); event++) {
int fd = perfCounters[op_ctr_idx].get_fd();
if (event == 0) {
rc = _prepare_to_record_one_fd(cpu, fd);
} else {
if ((rc = ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT,
fd_for_set_output)) < 0)
perror("prepareToRecord: ioctl #3 failed");
}
if (rc < 0)
return rc;
if ((rc = ioctl(fd, PERF_EVENT_IOC_ENABLE)) < 0) {
perror("prepareToRecord: ioctl #4 failed");
return rc;
}
op_ctr_idx++;
}
}
}
return rc;
}
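/* Example of the fd grouping performed above, assuming num_cpus == 2 and
 * two events (fd[i] denotes perfCounters[i].get_fd()):
 *
 *   cpu 0:  fd[0] -> mmap'ed ring buffer #0
 *           fd[1] -> ioctl(fd[1], PERF_EVENT_IOC_SET_OUTPUT, fd[0])
 *   cpu 1:  fd[2] -> mmap'ed ring buffer #1
 *           fd[3] -> ioctl(fd[3], PERF_EVENT_IOC_SET_OUTPUT, fd[2])
 *
 * i.e., one ring buffer per cpu (or per thread in the process-group case),
 * regardless of how many events are being profiled.
 */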
void operf_record::setup()
{
bool all_cpus_avail = true;
int rc = 0;
struct dirent *entry = NULL;
DIR *dir = NULL;
string err_msg;
char cpus_online[257];
bool profile_process_group = false;
if (system_wide)
cverb << vrecord << "operf_record::setup() for system-wide profiling" << endl;
else
cverb << vrecord << "operf_record::setup() with pid_started = " << pid_started << endl;
if (pid_started || system_wide) {
if ((rc = op_get_process_info(system_wide, pid_to_profile, this)) < 0) {
if (rc == OP_PERF_HANDLED_ERROR)
return;
else
throw runtime_error("Unexpected error in operf_record setup");
}
// 'pid_started && (procs.size() > 1)' implies the process that the user
// has requested us to profile has cloned one or more children.
profile_process_group = pid_started && (procs.size() > 1);
}
pagesize = sysconf(_SC_PAGE_SIZE);
// If profiling a process group, use a smaller mmap length to avoid EINVAL.
num_mmap_pages = profile_process_group ? 1 : (512 * 1024)/pagesize;
/* To set up to profile an existing thread group, we need to call perf_event_open
* for each thread, and we need to pass cpu=-1 on the syscall.
*/
use_cpu_minus_one = use_cpu_minus_one ? true : profile_process_group;
num_cpus = use_cpu_minus_one ? 1 : sysconf(_SC_NPROCESSORS_ONLN);
if (num_cpus < 1) {
char int_str[256];
sprintf(int_str, "Number of online CPUs is %d; cannot continue", num_cpus);
throw runtime_error(int_str);
}
cverb << vrecord << "calling perf_event_open for pid " << pid_to_profile << " on "
<< num_cpus << " cpus" << endl;
FILE * online_cpus = fopen("/sys/devices/system/cpu/online", "r");
if (!online_cpus) {
err_msg = "Internal Error: Number of online cpus cannot be determined.";
rc = -1;
goto error;
}
memset(cpus_online, 0, sizeof(cpus_online));
fgets(cpus_online, sizeof(cpus_online), online_cpus);
if (!cpus_online[0]) {
fclose(online_cpus);
err_msg = "Internal Error: Number of online cpus cannot be determined.";
rc = -1;
goto error;
}
if (strchr(cpus_online, ',') || cpus_online[0] != '0') {
all_cpus_avail = false;
if ((dir = opendir("/sys/devices/system/cpu")) == NULL) {
fclose(online_cpus);
err_msg = "Internal Error: Number of online cpus cannot be determined.";
rc = -1;
goto error;
}
}
fclose(online_cpus);
for (int cpu = 0; cpu < num_cpus; cpu++) {
int real_cpu;
int mmap_fd;
if (use_cpu_minus_one) {
real_cpu = -1;
} else if (all_cpus_avail) {
real_cpu = cpu;
} else {
real_cpu = op_get_next_online_cpu(dir, entry);
if (real_cpu < 0) {
err_msg = "Internal Error: Number of online cpus cannot be determined.";
rc = -1;
goto error;
}
}
size_t num_procs = profile_process_group ? procs.size() : 1;
/* To profile a parent and its children, the perf_events kernel subsystem
* requires us to use cpu=-1 on the perf_event_open call for each of the
* processes in the group. But perf_events also prevents us from specifying
* "inherit" on the perf_event_attr we pass to perf_event_open when cpu is '-1'.
*/
bool inherit = !profile_process_group;
for (unsigned proc_idx = 0; proc_idx < num_procs; proc_idx++) {
for (unsigned event = 0; event < evts.size(); event++) {
/* For a parent process, comm.tid == comm.pid, but for child
 * processes in a process group, comm.pid is the parent's pid,
 * so we must use comm.tid for the perf_event_open call. Since
 * the two are equal for the parent, comm.tid works in all cases.
 */
pid_t pid_for_open = profile_process_group ? procs[proc_idx].tid
: pid_to_profile;
operf_counter op_ctr(operf_counter(evts[event],
(!pid_started && !system_wide),
callgraph, separate_cpu,
inherit, event));
if ((rc = op_ctr.perf_event_open(pid_for_open,
real_cpu, this)) < 0) {
err_msg = "Internal Error. Perf event setup failed.";
goto error;
}
perfCounters.push_back(op_ctr);
}
}
}
int num_mmaps;
if (pid_started && (procs.size() > 1))
num_mmaps = procs.size();
else
num_mmaps = num_cpus;
poll_data = new struct pollfd [num_mmaps];
if ((rc = prepareToRecord()) < 0) {
err_msg = "Internal Error. Perf event setup failed.";
goto error;
}
write_op_header_info();
// Set bit to indicate we're set to go.
valid = true;
if (dir)
closedir(dir);
return;
error:
delete[] poll_data;
poll_data = NULL;
for (int i = 0; i < samples_array.size(); i++) {
struct mmap_data *md = &samples_array[i];
munmap(md->base, (num_mmap_pages + 1) * pagesize);
}
samples_array.clear();
if (dir)
closedir(dir);
close(output_fd);
if (rc != OP_PERF_HANDLED_ERROR)
throw runtime_error(err_msg);
}
void operf_record::record_process_info(void)
{
map<unsigned int, unsigned int> pids_mapped;
pid_t last_tgid = -1;
for (unsigned int proc_idx = 0; proc_idx < procs.size(); proc_idx++)
{
int num = OP_perf_utils::op_write_output(output_fd, &procs[proc_idx],
procs[proc_idx].header.size);
add_to_total(num);
if (cverb << vrecord)
cout << "Created COMM event for " << procs[proc_idx].comm << endl;
if ((procs[proc_idx].pid == last_tgid) ||
(pids_mapped.find(procs[proc_idx].pid) != pids_mapped.end()))
continue;
OP_perf_utils::op_record_process_exec_mmaps(procs[proc_idx].tid,
procs[proc_idx].pid,
output_fd, this);
pids_mapped[procs[proc_idx].pid] = last_tgid = procs[proc_idx].pid;
}
}
void operf_record::recordPerfData(void)
{
bool disabled = false;
if (pid_started || system_wide)
record_process_info();
op_record_kernel_info(vmlinux_file, kernel_start, kernel_end, output_fd, this);
cerr << "operf: Profiler started" << endl;
while (1) {
int prev = sample_reads;
for (int i = 0; i < samples_array.size(); i++) {
if (samples_array[i].base)
op_get_kernel_event_data(&samples_array[i], this);
}
if (quit && disabled)
break;
if (prev == sample_reads) {
(void)poll(poll_data, poll_count, -1);
}
if (quit) {
for (unsigned int i = 0; i < perfCounters.size(); i++)
ioctl(perfCounters[i].get_fd(), PERF_EVENT_IOC_DISABLE);
disabled = true;
cverb << vrecord << "operf_record::recordPerfData received signal to quit." << endl;
}
}
cverb << vdebug << "operf recording finished." << endl;
}
void operf_read::init(int sample_data_pipe_fd, string input_filename, string samples_loc, op_cpu cputype,
vector<operf_event_t> & events, bool systemwide)
{
sample_data_fd = sample_data_pipe_fd;
inputFname = input_filename;
sampledir = samples_loc;
evts = events;
cpu_type = cputype;
syswide = systemwide;
}
operf_read::~operf_read()
{
evts.clear();
}
int operf_read::_read_header_info_with_ifstream(void)
{
struct OP_file_header fheader;
int num_fattrs, ret = 0;
size_t fattr_size;
istrm.seekg(0, ios_base::beg);
if (op_read_from_stream(istrm, (char *)&fheader, sizeof(fheader)) != sizeof(fheader)) {
cerr << "Error: input file " << inputFname << " does not have enough data for header" << endl;
ret = OP_PERF_HANDLED_ERROR;
goto out;
}
if (memcmp(&fheader.magic, __op_magic, sizeof(fheader.magic))) {
cerr << "Error: input file " << inputFname << " does not have expected header data" << endl;
ret = OP_PERF_HANDLED_ERROR;
goto out;
}
cverb << vconvert << "operf magic number " << (char *)&fheader.magic << " matches expected __op_magic " << __op_magic << endl;
opHeader.attr_offset = fheader.attrs.offset;
opHeader.data_offset = fheader.data.offset;
opHeader.data_size = fheader.data.size;
fattr_size = sizeof(struct op_file_attr);
if (fattr_size != fheader.attr_size) {
cerr << "Error: perf_events binary incompatibility. Event data collection was apparently "
<< endl << "performed under a different kernel version than current." << endl;
ret = OP_PERF_HANDLED_ERROR;
goto out;
}
num_fattrs = fheader.attrs.size/fheader.attr_size;
cverb << vconvert << "num_fattrs is " << num_fattrs << endl;
istrm.seekg(opHeader.attr_offset, ios_base::beg);
for (int i = 0; i < num_fattrs; i++) {
struct op_file_attr f_attr;
streamsize fattr_size = sizeof(f_attr);
if (op_read_from_stream(istrm, (char *)&f_attr, fattr_size) != fattr_size) {
cerr << "Error: Unexpected end of input file " << inputFname << "." << endl;
ret = OP_PERF_HANDLED_ERROR;
goto out;
}
opHeader.h_attrs[i].attr = f_attr.attr;
streampos next_f_attr = istrm.tellg();
int num_ids = f_attr.ids.size/sizeof(u64);
istrm.seekg(f_attr.ids.offset, ios_base::beg);
for (int id = 0; id < num_ids; id++) {
u64 perf_id;
streamsize perfid_size = sizeof(perf_id);
if (op_read_from_stream(istrm, (char *)& perf_id, perfid_size) != perfid_size) {
cerr << "Error: Unexpected end of input file " << inputFname << "." << endl;
ret = OP_PERF_HANDLED_ERROR;
goto out;
}
ostringstream message;
message << "Perf header: id = " << hex << (unsigned long long)perf_id << endl;
cverb << vconvert << message.str();
opHeader.h_attrs[i].ids.push_back(perf_id);
}
istrm.seekg(next_f_attr, ios_base::beg);
}
out:
istrm.close();
return ret;
}
int operf_read::_read_perf_header_from_file(void)
{
int ret = 0;
opHeader.data_size = 0;
istrm.open(inputFname.c_str(), ios_base::in);
if (!istrm.good()) {
valid = false;
cerr << "Input stream bad for " << inputFname << endl;
ret = OP_PERF_HANDLED_ERROR;
goto out;
}
istrm.peek();
if (istrm.eof()) {
cverb << vconvert << "operf_read::readPerfHeader: Empty profile data file." << endl;
valid = false;
ret = OP_PERF_HANDLED_ERROR;
goto out;
}
cverb << vconvert << "operf_read: successfully opened input file " << inputFname << endl;
if ((ret = _read_header_info_with_ifstream()) == 0) {
valid = true;
cverb << vconvert << "Successfully read perf header" << endl;
} else {
valid = false;
}
out:
return ret;
}
int operf_read::_read_perf_header_from_pipe(void)
{
struct OP_file_header fheader;
string errmsg;
int num_fattrs;
size_t fattr_size;
vector<struct op_file_attr> f_attr_cache;
errno = 0;
if (read(sample_data_fd, &fheader, sizeof(fheader)) != sizeof(fheader)) {
errmsg = "Error reading header on sample data pipe: " + string(strerror(errno));
goto fail;
}
if (memcmp(&fheader.magic, __op_magic, sizeof(fheader.magic))) {
errmsg = "Error: operf sample data does not have expected header data";
goto fail;
}
cverb << vconvert << "operf magic number " << (char *)&fheader.magic << " matches expected __op_magic " << __op_magic << endl;
fattr_size = sizeof(struct op_file_attr);
if (fattr_size != fheader.attr_size) {
errmsg = "Error: perf_events binary incompatibility. Event data collection was apparently "
"performed under a different kernel version than current.";
goto fail;
}
num_fattrs = fheader.attrs.size/fheader.attr_size;
cverb << vconvert << "num_fattrs is " << num_fattrs << endl;
for (int i = 0; i < num_fattrs; i++) {
struct op_file_attr f_attr;
streamsize fattr_size = sizeof(f_attr);
if (read(sample_data_fd, (char *)&f_attr, fattr_size) != fattr_size) {
errmsg = "Error reading file attr on sample data pipe: " + string(strerror(errno));
goto fail;
}
opHeader.h_attrs[i].attr = f_attr.attr;
f_attr_cache.push_back(f_attr);
}
for (int i = 0; i < num_fattrs; i++) {
struct op_file_attr f_attr = f_attr_cache[i];
int num_ids = f_attr.ids.size/sizeof(u64);
for (int id = 0; id < num_ids; id++) {
u64 perf_id;
streamsize perfid_size = sizeof(perf_id);
if (read(sample_data_fd, (char *)& perf_id, perfid_size) != perfid_size) {
errmsg = "Error reading perf ID on sample data pipe: " + string(strerror(errno));
goto fail;
}
ostringstream message;
message << "Perf header: id = " << hex << (unsigned long long)perf_id << endl;
cverb << vconvert << message.str();
opHeader.h_attrs[i].ids.push_back(perf_id);
}
}
valid = true;
cverb << vconvert << "Successfully read perf header" << endl;
return 0;
fail:
cerr << errmsg << endl;
return OP_PERF_HANDLED_ERROR;
}
int operf_read::readPerfHeader(void)
{
if (!inputFname.empty())
return _read_perf_header_from_file();
else
return _read_perf_header_from_pipe();
}
int operf_read::get_eventnum_by_perf_event_id(u64 id) const
{
for (unsigned i = 0; i < evts.size(); i++) {
struct op_header_evt_info attr = opHeader.h_attrs[i];
for (unsigned j = 0; j < attr.ids.size(); j++) {
if (attr.ids[j] == id)
return i;
}
}
return -1;
}
unsigned int operf_read::convertPerfData(void)
{
unsigned int num_bytes = 0;
struct mmap_info info;
bool error = false;
event_t * event;
if (!inputFname.empty()) {
info.file_data_offset = opHeader.data_offset;
info.file_data_size = opHeader.data_size;
cverb << vdebug << "Expecting to read approximately " << dec
<< info.file_data_size - info.file_data_offset
<< " bytes from operf sample data file." << endl;
info.traceFD = open(inputFname.c_str(), O_RDONLY);
if (info.traceFD == -1) {
cerr << "Error: open failed with errno:\n\t" << strerror(errno) << endl;
throw runtime_error("Error: Unable to open operf data file");
}
cverb << vdebug << "operf_read opened " << inputFname << endl;
pg_sz = sysconf(_SC_PAGESIZE);
if (op_mmap_trace_file(info, true) < 0) {
close(info.traceFD);
throw runtime_error("Error: Unable to mmap operf data file");
}
} else {
// Allocate way more than enough space for a really big event with a long callchain
event = (event_t *)xmalloc(65536);
memset(event, '\0', 65536);
}
for (int i = 0; i < OPERF_MAX_STATS; i++)
operf_stats[i] = 0;
ostringstream message;
message << "Converting operf data to oprofile sample data format" << endl;
message << "sample type is " << hex << opHeader.h_attrs[0].attr.sample_type << endl;
cverb << vdebug << message.str();
first_time_processing = true;
int num_recs = 0;
struct perf_event_header last_header;
bool print_progress = !inputFname.empty() && syswide;
if (print_progress)
cerr << "Converting profile data to OProfile format" << endl;
while (1) {
streamsize rec_size = 0;
if (!inputFname.empty()) {
event = _get_perf_event_from_file(info);
if (event == NULL)
break;
} else {
if (_get_perf_event_from_pipe(event, sample_data_fd) < 0)
break;
}
rec_size = event->header.size;
if ((!is_header_valid(event->header)) ||
((op_write_event(event, opHeader.h_attrs[0].attr.sample_type)) < 0)) {
error = true;
last_header = event->header;
break;
}
num_bytes += rec_size;
num_recs++;
if ((num_recs % 1000000 == 0) && print_progress)
cerr << ".";
}
if (unlikely(error)) {
if (!inputFname.empty()) {
cerr << "ERROR: operf_read::convertPerfData quitting. Bad data read from file." << endl;
} else {
cerr << "ERROR: operf_read::convertPerfData quitting. Bad data read from pipe." << endl;
cerr << "Closing read end of data pipe. operf-record process will stop with SIGPIPE (13)."
<< endl;
}
cerr << "Try lowering the sample frequency to avoid this error; e.g., double the 'count'"
<< endl << "value in your event specification." << endl;
cverb << vdebug << "Event header type: " << last_header.type << "; size: " << last_header.size << endl;
}
first_time_processing = false;
if (!error)
op_reprocess_unresolved_events(opHeader.h_attrs[0].attr.sample_type, print_progress);
if (print_progress)
cerr << endl;
op_release_resources();
operf_print_stats(operf_options::session_dir, start_time_human_readable, throttled, evts);
char * cbuf;
cbuf = (char *)xmalloc(operf_options::session_dir.length() + 5);
strcpy(cbuf, operf_options::session_dir.c_str());
strcat(cbuf, "/abi");
op_write_abi_to_file(cbuf);
free(cbuf);
if (!inputFname.empty())
close(info.traceFD);
else
free(event);
return num_bytes;
}