blob: 9cc4e5e4920b939df7441b99cb6197205f3e7ff0 [file] [log] [blame]
// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef THIRD_PARTY_BLINK_PUBLIC_COMMON_PRIVACY_BUDGET_IDENTIFIABLE_TOKEN_BUILDER_H_
#define THIRD_PARTY_BLINK_PUBLIC_COMMON_PRIVACY_BUDGET_IDENTIFIABLE_TOKEN_BUILDER_H_
#include <array>
#include "base/containers/span.h"
#include "base/strings/string_piece.h"
#include "base/sys_byteorder.h"
#include "third_party/blink/public/common/common_export.h"
#include "third_party/blink/public/common/privacy_budget/identifiability_internal_templates.h"
#include "third_party/blink/public/common/privacy_budget/identifiable_token.h"
namespace blink {
// Builds an IdentifiableToken incrementally.
//
// Use this when the input to a sample is a bunch of disjoint objects, or the
// sample needs to include objects that are incrementally encountered.
//
// Notes:
// * The digest returned by this class is *NOT* the same as the one returned
// by IdentifiabilityDigestOfBytes() for the same set of bytes. This is due
// to the block-based chaining of digests used by this class.
// IdentifiabilityDigestOfBytes and this class are *NOT* interchangeable.
//
// TODO(asanka): IdentifiabilityDigestOfBytes() and this class should
// interop better. Perhaps by making the latter use the former.
//
// * The digest returned by this class is *NOT* the same as what you would
// acquire by invoking IdentifiableToken() over the same object.
// IdentifiableToken() and this class are *NOT* interchangeable.
//
// * The digest returned by this class only depends on the cumulative sequence
// of bytes that are fed to it. The partitioning thereof is irrelevant.
//
// * This object never finalizes. Partial digests can be extracted at any
// point.
class BLINK_COMMON_EXPORT IdentifiableTokenBuilder {
 public:
  // Convenient alias for a span of const uint8_t.
  using ByteSpan = IdentifiableToken::ByteSpan;

  // Initializes an "empty" incremental digest for the purpose of constructing
  // an identifiability sample.
  IdentifiableTokenBuilder();

  // Initializes an incremental digest and populates it with the data contained
  // in |message|.
  explicit IdentifiableTokenBuilder(ByteSpan message);

  // Copies the intermediate state.
  IdentifiableTokenBuilder(const IdentifiableTokenBuilder&);

  // Copy-assigns the intermediate state of |other| to this builder.
  //
  // This must be spelled out rather than left to the compiler: |position_| is
  // an iterator into this object's own |partial_| array, so a member-wise
  // copy would leave |position_| pointing into |other|'s buffer. Instead the
  // insertion offset is carried over and re-based onto |partial_|.
  IdentifiableTokenBuilder& operator=(const IdentifiableTokenBuilder& other) {
    // Compute the offset before modifying any member so that self-assignment
    // leaves the object unchanged.
    const BlockBuffer::const_iterator other_position = other.position_;
    const BlockBuffer::difference_type offset =
        other_position - other.partial_.begin();
    partial_ = other.partial_;
    position_ = partial_.begin() + offset;
    chaining_value_ = other.chaining_value_;
    return *this;
  }

  // Feeds data contained in |buffer| to the digest.
  IdentifiableTokenBuilder& AddBytes(ByteSpan buffer);

  // Feeds data contained in |buffer| to the digest, but precedes the buffer
  // contents with an integer indicating the length. Use this when:
  //
  // * |buffer| is atomic. I.e. it will always be added as a single buffer.
  //
  // * The boundary between |buffer| and adjacent objects cannot be uniquely
  //   established based on content.
  //
  // E.g.: Ignoring NUL terminators, the pair of strings "abcd", "efgh" will be
  // assigned the same token as the strings "abcdefg", "h" if both are added
  // individually via AddBytes(). But they will have distinct digests if
  // added via AddAtomic().
  //
  // If the contents of the object cannot be specified in a contiguous span of
  // memory, then consider adding a length directly via AddValue() prior to
  // adding the contents of the buffer. Doing so will achieve the same ends as
  // AddAtomic().
  IdentifiableTokenBuilder& AddAtomic(ByteSpan buffer);

  // Convenience overload: forwards the bytes of |string| to
  // AddAtomic(ByteSpan).
  IdentifiableTokenBuilder& AddAtomic(base::StringPiece string) {
    return AddAtomic(base::as_bytes(base::make_span(string)));
  }

  // Feeds the underlying value of the |token| itself to the digest. Use this
  // when |token| is computed in parallel in order to preserve the ordering of
  // values that were seen in a concurrent sequence that cannot be
  // deterministically interleaved into the primary stream.
  IdentifiableTokenBuilder& AddToken(IdentifiableToken token);

  // Helper for feeding primitive types by value efficiently. Anything more
  // complicated than that should be passed in as a base::span<const uint8_t>.
  //
  // Adds eight bytes to the digest. If the type of the value doesn't consume
  // all of the bytes, pads the remainder with NUL bytes.
  //
  // Only participates in overload resolution when |T| is not cv- or
  // reference-qualified, has unique object representations, and is no larger
  // than a uint64_t.
  template <typename T,
            typename std::enable_if_t<
                std::is_same<T, internal::remove_cvref_t<T>>::value &&
                internal::has_unique_object_representations<T>::value &&
                sizeof(T) <= sizeof(uint64_t)>* = nullptr>
  IdentifiableTokenBuilder& AddValue(T in) {
    // Pad the partial buffer first so the 8-byte value lands on a
    // |kBlockAlignment| boundary.
    AlignPartialBuffer();
    // The value is passed through base::ByteSwapToLE64() so that the bytes
    // handed to AddBytes() are in little-endian order.
    int64_t clean_buffer =
        base::ByteSwapToLE64(internal::DigestOfObjectRepresentation(in));
    return AddBytes(base::make_span(
        reinterpret_cast<const uint8_t*>(&clean_buffer), sizeof(clean_buffer)));
  }

  // Conversion operator captures an intermediate digest.
  //
  // The sample captures all the data that's been fed into the digest so far,
  // but doesn't finalize the digest. It is valid to continue adding data after
  // constructing an intermediate sample.
  //
  // (google-explicit-constructor also flags user-defined conversion operators.)
  // NOLINTNEXTLINE(google-explicit-constructor)
  operator IdentifiableToken() const;

  // Captures an intermediate digest.
  //
  // The sample captures all the data that's been fed into the digest so far,
  // but doesn't finalize the digest. It is valid to continue adding data after
  // constructing an intermediate sample.
  IdentifiableToken GetToken() const;

  // No comparisons.
  bool operator==(const IdentifiableTokenBuilder&) const = delete;
  bool operator<(const IdentifiableTokenBuilder&) const = delete;

 private:
  // Block size. Must be a multiple of 64. Higher block sizes consume more
  // memory. The extra cost is unlikely to be worth it.
  //
  // Under the covers we use CityHash64. It can pretty efficiently digest
  // 64-byte blocks.
  static constexpr size_t kBlockSizeInBytes = 64;

  // Target alignment for new buffers. This is set to 8 for all platforms and
  // must always stay constant across platforms.
  static constexpr size_t kBlockAlignment = 8;

  // An array of exactly |kBlockSizeInBytes| bytes.
  using BlockBuffer = std::array<uint8_t, kBlockSizeInBytes>;

  // A view of a full block.
  using ConstFullBlockSpan = base::span<const uint8_t, kBlockSizeInBytes>;

  // Returns true if the partial buffer is aligned on |kBlockAlignment|
  // boundary.
  bool IsAligned() const;

  // Appends enough NUL bytes to |partial_| until the next insertion point is
  // aligned on a |kBlockAlignment| boundary.
  //
  // If the partial buffer is non-empty, its size is unlikely to be aligned at
  // machine word boundary. This makes subsequent append operations slow for
  // data types that are already aligned.
  //
  // This should only be called prior to adding an atomic buffer.
  void AlignPartialBuffer();

  // Captures the |kBlockSizeInBytes| bytes of data in |block| into the digest.
  // |block| must be exactly this many bytes.
  void DigestBlock(ConstFullBlockSpan block);

  // Captures as many bytes as possible from |message| into the partial block in
  // |partial_|. It captures a maximum of |kBlockSizeInBytes - 1| bytes.
  //
  // Returns a span covering the remainder of |message| that was not consumed.
  ByteSpan SkimIntoPartial(ByteSpan message);

  // Returns a span for the contents of the partial block.
  //
  // Can be called at any point. Does not change the state of the partial
  // buffer.
  ByteSpan GetPartialBlock() const;

  // Returns a span that includes the contents of the partial block and backed
  // by |partial_|.
  //
  // NOTE: Should only be called once |kBlockSizeInBytes| bytes have been
  // accumulated. Resets |partial_size_| upon completion.
  //
  // NOTE: Any subsequent AddBytes(), AddValue(), AddAtomic() calls will
  // invalidate the returned FullBlock.
  ConstFullBlockSpan TakeCompletedBlock();

  // Size of partially filled buffer.
  size_t PartialSize() const;

  // Accumulates smaller pieces of data until we have a full block.
  alignas(int64_t) BlockBuffer partial_;

  // Next available position in `partial_`. std::array iterators are never
  // invalidated.
  //
  // Because this points into this object's own `partial_`, it must be
  // re-based (not copied verbatim) whenever state is copied from another
  // builder.
  BlockBuffer::iterator position_ = partial_.begin();

  // Merkle-Damgård chaining.
  uint64_t chaining_value_;
};
} // namespace blink
#endif // THIRD_PARTY_BLINK_PUBLIC_COMMON_PRIVACY_BUDGET_IDENTIFIABLE_TOKEN_BUILDER_H_