nest-open-source / nest-cam / 4320010 / faux-folly / 105bcde253ca75a43c431b7249cfa1f21a5cf1d7 / . / faux-folly / folly / stats / TimeseriesHistogram.h

/* | |

* Copyright 2015 Facebook, Inc. | |

* | |

* Licensed under the Apache License, Version 2.0 (the "License"); | |

* you may not use this file except in compliance with the License. | |

* You may obtain a copy of the License at | |

* | |

* http://www.apache.org/licenses/LICENSE-2.0 | |

* | |

* Unless required by applicable law or agreed to in writing, software | |

* distributed under the License is distributed on an "AS IS" BASIS, | |

* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |

* See the License for the specific language governing permissions and | |

* limitations under the License. | |

*/ | |

#ifndef FOLLY_TIMESERIES_HISTOGRAM_H_ | |

#define FOLLY_TIMESERIES_HISTOGRAM_H_ | |

#include <string> | |

#include <boost/static_assert.hpp> | |

#include <folly/stats/Histogram.h> | |

#include <folly/stats/MultiLevelTimeSeries.h> | |

namespace folly { | |

/* | |

* TimeseriesHistogram tracks data distributions as they change over time. | |

* | |

* Specifically, it is a bucketed histogram with different value ranges assigned | |

* to each bucket. Within each bucket is a MultiLevelTimeSeries from | |

* 'folly/stats/MultiLevelTimeSeries.h'. This means that each bucket contains a | |

* different set of data for different historical time periods, and one can | |

* query data distributions over different trailing time windows. | |

* | |

* For example, this can answer questions: "What is the data distribution over | |

* the last minute? Over the last 10 minutes? Since I last cleared this | |

* histogram?" | |

* | |

* The class can also estimate percentiles and answer questions like: "What was | |

* the 99th percentile data value over the last 10 minutes?" | |

* | |

* Note: that depending on the size of your buckets and the smoothness | |

* of your data distribution, the estimate may be way off from the actual | |

* value. In particular, if the given percentile falls outside of the bucket | |

* range (i.e. your buckets range in 0 - 100,000 but the 99th percentile is | |

* around 115,000) this estimate may be very wrong. | |

* | |

* The memory usage for a typical histogram is roughly 3k * (# of buckets). All | |

* insertion operations are amortized O(1), and all queries are O(# of buckets). | |

*/ | |

template <class T, class TT=std::chrono::seconds, | |

class C=folly::MultiLevelTimeSeries<T, TT>> | |

class TimeseriesHistogram { | |

private: | |

// NOTE: T must be equivalent to _signed_ numeric type for our math. | |

BOOST_STATIC_ASSERT(std::numeric_limits<T>::is_signed); | |

public: | |

// values to be inserted into container | |

typedef T ValueType; | |

// the container type we use internally for each bucket | |

typedef C ContainerType; | |

// The time type. | |

typedef TT TimeType; | |

/* | |

* Create a TimeSeries histogram and initialize the bucketing and levels. | |

* | |

* The buckets are created by chopping the range [min, max) into pieces | |

* of size bucketSize, with the last bucket being potentially shorter. Two | |

* additional buckets are always created -- the "under" bucket for the range | |

* (-inf, min) and the "over" bucket for the range [max, +inf). | |

* | |

* @param bucketSize the width of each bucket | |

* @param min the smallest value for the bucket range. | |

* @param max the largest value for the bucket range | |

* @param defaultContainer a pre-initialized timeseries with the desired | |

* number of levels and their durations. | |

*/ | |

TimeseriesHistogram(ValueType bucketSize, ValueType min, ValueType max, | |

const ContainerType& defaultContainer); | |

/* Return the bucket size of each bucket in the histogram. */ | |

ValueType getBucketSize() const { return buckets_.getBucketSize(); } | |

/* Return the min value at which bucketing begins. */ | |

ValueType getMin() const { return buckets_.getMin(); } | |

/* Return the max value at which bucketing ends. */ | |

ValueType getMax() const { return buckets_.getMax(); } | |

/* Return the number of levels of the Timeseries object in each bucket */ | |

int getNumLevels() const { | |

return buckets_.getByIndex(0).numLevels(); | |

} | |

/* Return the number of buckets */ | |

int getNumBuckets() const { return buckets_.getNumBuckets(); } | |

/* Return the bucket index into which the given value would fall. */ | |

int getBucketIdx(const ValueType& value) const; | |

/* | |

* Return the threshold of the bucket for the given index in range | |

* [0..numBuckets). The bucket will have range [thresh, thresh + bucketSize) | |

* or [thresh, max), whichever is shorter. | |

*/ | |

ValueType getBucketMin(int bucketIdx) const { | |

return buckets_.getBucketMin(bucketIdx); | |

} | |

/* Return the actual timeseries in the given bucket (for reading only!) */ | |

const ContainerType& getBucket(int bucketIdx) const { | |

return buckets_.getByIndex(bucketIdx); | |

} | |

/* Total count of values at the given timeseries level (all buckets). */ | |

int64_t count(int level) const { | |

int64_t total = 0; | |

for (unsigned int b = 0; b < buckets_.getNumBuckets(); ++b) { | |

total += buckets_.getByIndex(b).count(level); | |

} | |

return total; | |

} | |

/* Total count of values added during the given interval (all buckets). */ | |

int64_t count(TimeType start, TimeType end) const { | |

int64_t total = 0; | |

for (unsigned int b = 0; b < buckets_.getNumBuckets(); ++b) { | |

total += buckets_.getByIndex(b).count(start, end); | |

} | |

return total; | |

} | |

/* Total sum of values at the given timeseries level (all buckets). */ | |

ValueType sum(int level) const { | |

ValueType total = ValueType(); | |

for (unsigned int b = 0; b < buckets_.getNumBuckets(); ++b) { | |

total += buckets_.getByIndex(b).sum(level); | |

} | |

return total; | |

} | |

/* Total sum of values added during the given interval (all buckets). */ | |

ValueType sum(TimeType start, TimeType end) const { | |

ValueType total = ValueType(); | |

for (unsigned int b = 0; b < buckets_.getNumBuckets(); ++b) { | |

total += buckets_.getByIndex(b).sum(start, end); | |

} | |

return total; | |

} | |

/* Average of values at the given timeseries level (all buckets). */ | |

template <typename ReturnType=double> | |

ReturnType avg(int level) const; | |

/* Average of values added during the given interval (all buckets). */ | |

template <typename ReturnType=double> | |

ReturnType avg(TimeType start, TimeType end) const; | |

/* | |

* Rate at the given timeseries level (all buckets). | |

* This is the sum of all values divided by the time interval (in seconds). | |

*/ | |

ValueType rate(int level) const; | |

/* | |

* Rate for the given interval (all buckets). | |

* This is the sum of all values divided by the time interval (in seconds). | |

*/ | |

template <typename ReturnType=double> | |

ReturnType rate(TimeType start, TimeType end) const; | |

/* | |

* Update every underlying timeseries object with the given timestamp. You | |

* must call this directly before querying to ensure that the data in all | |

* buckets is decayed properly. | |

*/ | |

void update(TimeType now); | |

/* clear all the data from the histogram. */ | |

void clear(); | |

/* Add a value into the histogram with timestamp 'now' */ | |

void addValue(TimeType now, const ValueType& value); | |

/* Add a value the given number of times with timestamp 'now' */ | |

void addValue(TimeType now, const ValueType& value, int64_t times); | |

/* | |

* Add all of the values from the specified histogram. | |

* | |

* All of the values will be added to the current time-slot. | |

* | |

* One use of this is for thread-local caching of frequently updated | |

* histogram data. For example, each thread can store a thread-local | |

* Histogram that is updated frequently, and only add it to the global | |

* TimeseriesHistogram once a second. | |

*/ | |

void addValues(TimeType now, const folly::Histogram<ValueType>& values); | |

/* | |

* Return an estimate of the value at the given percentile in the histogram | |

* in the given timeseries level. The percentile is estimated as follows: | |

* | |

* - We retrieve a count of the values in each bucket (at the given level) | |

* - We determine via the counts which bucket the given percentile falls in. | |

* - We assume the average value in the bucket is also its median | |

* - We then linearly interpolate within the bucket, by assuming that the | |

* distribution is uniform in the two value ranges [left, median) and | |

* [median, right) where [left, right) is the bucket value range. | |

* | |

* Caveats: | |

* - If the histogram is empty, this always returns ValueType(), usually 0. | |

* - For the 'under' and 'over' special buckets, their range is unbounded | |

* on one side. In order for the interpolation to work, we assume that | |

* the average value in the bucket is equidistant from the two edges of | |

* the bucket. In other words, we assume that the distance between the | |

* average and the known bound is equal to the distance between the average | |

* and the unknown bound. | |

*/ | |

ValueType getPercentileEstimate(double pct, int level) const; | |

/* | |

* Return an estimate of the value at the given percentile in the histogram | |

* in the given historical interval. Please see the documentation for | |

* getPercentileEstimate(int pct, int level) for the explanation of the | |

* estimation algorithm. | |

*/ | |

ValueType getPercentileEstimate(double pct, TimeType start, TimeType end) | |

const; | |

/* | |

* Return the bucket index that the given percentile falls into (in the | |

* given timeseries level). This index can then be used to retrieve either | |

* the bucket threshold, or other data from inside the bucket. | |

*/ | |

int getPercentileBucketIdx(double pct, int level) const; | |

/* | |

* Return the bucket index that the given percentile falls into (in the | |

* given historical interval). This index can then be used to retrieve either | |

* the bucket threshold, or other data from inside the bucket. | |

*/ | |

int getPercentileBucketIdx(double pct, TimeType start, TimeType end) const; | |

/* Get the bucket threshold for the bucket containing the given pct. */ | |

int getPercentileBucketMin(double pct, int level) const { | |

return getBucketMin(getPercentileBucketIdx(pct, level)); | |

} | |

/* Get the bucket threshold for the bucket containing the given pct. */ | |

int getPercentileBucketMin(double pct, TimeType start, TimeType end) const { | |

return getBucketMin(getPercentileBucketIdx(pct, start, end)); | |

} | |

/* | |

* Print out serialized data from all buckets at the given level. | |

* Format is: BUCKET [',' BUCKET ...] | |

* Where: BUCKET == bucketMin ':' count ':' avg | |

*/ | |

std::string getString(int level) const; | |

/* | |

* Print out serialized data for all buckets in the historical interval. | |

* For format, please see getString(int level). | |

*/ | |

std::string getString(TimeType start, TimeType end) const; | |

private: | |

typedef ContainerType Bucket; | |

struct CountFromLevel { | |

explicit CountFromLevel(int level) : level_(level) {} | |

uint64_t operator()(const ContainerType& bucket) const { | |

return bucket.count(level_); | |

} | |

private: | |

int level_; | |

}; | |

struct CountFromInterval { | |

explicit CountFromInterval(TimeType start, TimeType end) | |

: start_(start), | |

end_(end) {} | |

uint64_t operator()(const ContainerType& bucket) const { | |

return bucket.count(start_, end_); | |

} | |

private: | |

TimeType start_; | |

TimeType end_; | |

}; | |

struct AvgFromLevel { | |

explicit AvgFromLevel(int level) : level_(level) {} | |

ValueType operator()(const ContainerType& bucket) const { | |

return bucket.template avg<ValueType>(level_); | |

} | |

private: | |

int level_; | |

}; | |

template <typename ReturnType> | |

struct AvgFromInterval { | |

explicit AvgFromInterval(TimeType start, TimeType end) | |

: start_(start), | |

end_(end) {} | |

ReturnType operator()(const ContainerType& bucket) const { | |

return bucket.template avg<ReturnType>(start_, end_); | |

} | |

private: | |

TimeType start_; | |

TimeType end_; | |

}; | |

/* | |

* Special logic for the case of only one unique value registered | |

* (this can happen when clients don't pick good bucket ranges or have | |

* other bugs). It's a lot easier for clients to track down these issues | |

* if they are getting the correct value. | |

*/ | |

void maybeHandleSingleUniqueValue(const ValueType& value); | |

folly::detail::HistogramBuckets<ValueType, ContainerType> buckets_; | |

bool haveNotSeenValue_; | |

bool singleUniqueValue_; | |

ValueType firstValue_; | |

}; | |

} // folly | |

#endif // FOLLY_TIMESERIES_HISTOGRAM_H_ |