/*
 * Copyright (C) 2013 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "third_party/blink/renderer/modules/speech/speech_synthesis.h"
#include "build/build_config.h"
#include "third_party/blink/public/common/browser_interface_broker_proxy.h"
#include "third_party/blink/public/common/privacy_budget/identifiability_metric_builder.h"
#include "third_party/blink/public/common/privacy_budget/identifiability_study_settings.h"
#include "third_party/blink/public/common/privacy_budget/identifiable_token.h"
#include "third_party/blink/public/common/privacy_budget/identifiable_token_builder.h"
#include "third_party/blink/public/common/thread_safe_browser_interface_broker_proxy.h"
#include "third_party/blink/public/platform/platform.h"
#include "third_party/blink/renderer/bindings/modules/v8/v8_speech_synthesis_error_event_init.h"
#include "third_party/blink/renderer/bindings/modules/v8/v8_speech_synthesis_event_init.h"
#include "third_party/blink/renderer/core/dom/document.h"
#include "third_party/blink/renderer/core/frame/deprecation.h"
#include "third_party/blink/renderer/core/frame/local_dom_window.h"
#include "third_party/blink/renderer/core/frame/web_feature.h"
#include "third_party/blink/renderer/core/html/media/autoplay_policy.h"
#include "third_party/blink/renderer/core/timing/dom_window_performance.h"
#include "third_party/blink/renderer/core/timing/performance.h"
#include "third_party/blink/renderer/modules/speech/speech_synthesis_error_event.h"
#include "third_party/blink/renderer/modules/speech/speech_synthesis_event.h"
#include "third_party/blink/renderer/modules/speech/speech_synthesis_voice.h"
#include "third_party/blink/renderer/platform/instrumentation/use_counter.h"
#include "third_party/blink/renderer/platform/privacy_budget/identifiability_digest_helpers.h"
namespace blink {
const char SpeechSynthesis::kSupplementName[] = "SpeechSynthesis";
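
// Supplement accessor backing window.speechSynthesis: lazily creates the
// per-window SpeechSynthesis instance on first access.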
SpeechSynthesis* SpeechSynthesis::speechSynthesis(LocalDOMWindow& window) {
  SpeechSynthesis* synthesis =
      Supplement<LocalDOMWindow>::From<SpeechSynthesis>(window);
  if (!synthesis) {
    synthesis = MakeGarbageCollected<SpeechSynthesis>(window);
    ProvideTo(window, synthesis);
#if defined(OS_ANDROID)
    // On Android devices we lazily initialize |mojom_synthesis_| to avoid
    // needlessly binding to the TTS service, see https://crbug.com/811929.
    // TODO(crbug/811929): Consider moving this logic into the Android-
    // specific backend implementation.
#else
    ignore_result(synthesis->TryEnsureMojomSynthesis());
#endif
  }
  return synthesis;
}

void SpeechSynthesis::CreateForTesting(
    LocalDOMWindow& window,
    mojo::PendingRemote<mojom::blink::SpeechSynthesis> mojom_synthesis) {
  DCHECK(!Supplement<LocalDOMWindow>::From<SpeechSynthesis>(window));
  SpeechSynthesis* synthesis = MakeGarbageCollected<SpeechSynthesis>(window);
  ProvideTo(window, synthesis);
  synthesis->SetMojomSynthesisForTesting(std::move(mojom_synthesis));
}

SpeechSynthesis::SpeechSynthesis(LocalDOMWindow& window)
    : Supplement<LocalDOMWindow>(window),
      receiver_(this, &window),
      mojom_synthesis_(&window) {}
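
// Invoked over the voice-list observer pipe (registered via
// AddVoiceListObserver() in TryEnsureMojomSynthesis()) whenever the set of
// platform voices changes; rebuilds the cached list and fires "voiceschanged".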
void SpeechSynthesis::OnSetVoiceList(
    Vector<mojom::blink::SpeechSynthesisVoicePtr> mojom_voices) {
  voice_list_.clear();
  for (auto& mojom_voice : mojom_voices) {
    voice_list_.push_back(
        MakeGarbageCollected<SpeechSynthesisVoice>(std::move(mojom_voice)));
  }
  VoicesDidChange();
}
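
// Backs the web-exposed speechSynthesis.getVoices(); called from script as,
// e.g.:
//   const voices = window.speechSynthesis.getVoices();
// Because the voice list arrives asynchronously over the observer pipe, the
// returned list may be empty until "voiceschanged" has fired.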
const HeapVector<Member<SpeechSynthesisVoice>>& SpeechSynthesis::getVoices() {
  // Kick off initialization here to ensure voice list gets populated.
  ignore_result(TryEnsureMojomSynthesis());
  RecordVoicesForIdentifiability();
  return voice_list_;
}
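
// The voice list is a known fingerprinting surface. When the identifiability
// study samples this surface, fold the attributes of every exposed voice into
// a single token and record it to UKM.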
void SpeechSynthesis::RecordVoicesForIdentifiability() const {
  constexpr IdentifiableSurface surface = IdentifiableSurface::FromTypeAndToken(
      IdentifiableSurface::Type::kWebFeature,
      WebFeature::kSpeechSynthesis_GetVoices_Method);
  if (!IdentifiabilityStudySettings::Get()->ShouldSample(surface))
    return;
  if (!GetSupplementable()->GetFrame())
    return;

  IdentifiableTokenBuilder builder;
  for (const auto& voice : voice_list_) {
    builder.AddToken(IdentifiabilityBenignStringToken(voice->voiceURI()));
    builder.AddToken(IdentifiabilityBenignStringToken(voice->lang()));
    builder.AddToken(IdentifiabilityBenignStringToken(voice->name()));
    builder.AddToken(voice->localService());
  }
  IdentifiabilityMetricBuilder(GetSupplementable()->UkmSourceID())
      .Set(surface, builder.GetToken())
      .Record(GetSupplementable()->UkmRecorder());
}

bool SpeechSynthesis::speaking() const {
  // If we have a current speech utterance, we are considered to be in a
  // speaking state, independent of whether that utterance happens to be
  // paused.
  return CurrentSpeechUtterance();
}

bool SpeechSynthesis::pending() const {
  // This is true if there are any utterances that have not yet started
  // speaking. Since the front of the queue is the utterance currently being
  // spoken, that means the queue holds more than one entry.
  return utterance_queue_.size() > 1;
}

bool SpeechSynthesis::paused() const {
  return is_paused_;
}
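
// Implements the web-exposed speechSynthesis.speak(); called from script as,
// e.g.:
//   speechSynthesis.speak(new SpeechSynthesisUtterance("hello"));
// The utterance is queued behind any in-progress speech, and is refused with
// a "not-allowed" error when autoplay policy requires user activation and the
// document has none.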
void SpeechSynthesis::speak(ScriptState* script_state,
                            SpeechSynthesisUtterance* utterance) {
  DCHECK(utterance);
  if (!script_state->ContextIsValid())
    return;

  // Note: Non-UseCounter based TTS metrics are of the form TextToSpeech.* and
  // are generally global, whereas these are scoped to a single page load.
  UseCounter::Count(GetSupplementable(), WebFeature::kTextToSpeech_Speak);
  GetSupplementable()->CountUseOnlyInCrossOriginIframe(
      WebFeature::kTextToSpeech_SpeakCrossOrigin);

  if (!IsAllowedToStartByAutoplay()) {
    Deprecation::CountDeprecation(
        GetSupplementable(),
        WebFeature::kTextToSpeech_SpeakDisallowedByAutoplay);
    FireErrorEvent(utterance, 0 /* char_index */, "not-allowed");
    return;
  }

  utterance_queue_.push_back(utterance);

  // If the queue was empty, speak this immediately.
  if (utterance_queue_.size() == 1)
    StartSpeakingImmediately();
}

void SpeechSynthesis::cancel() {
  // Remove all the items from the utterance queue. The platform
  // may still have references to some of these utterances and may
  // fire events on them asynchronously.
  utterance_queue_.clear();

  if (mojom::blink::SpeechSynthesis* mojom_synthesis =
          TryEnsureMojomSynthesis())
    mojom_synthesis->Cancel();
}

void SpeechSynthesis::pause() {
  if (is_paused_)
    return;

  if (mojom::blink::SpeechSynthesis* mojom_synthesis =
          TryEnsureMojomSynthesis())
    mojom_synthesis->Pause();
}

void SpeechSynthesis::resume() {
  if (!CurrentSpeechUtterance())
    return;

  if (mojom::blink::SpeechSynthesis* mojom_synthesis =
          TryEnsureMojomSynthesis())
    mojom_synthesis->Resume();
}

void SpeechSynthesis::DidStartSpeaking(SpeechSynthesisUtterance* utterance) {
  FireEvent(event_type_names::kStart, utterance, 0, 0, String());
}

void SpeechSynthesis::DidPauseSpeaking(SpeechSynthesisUtterance* utterance) {
  is_paused_ = true;
  FireEvent(event_type_names::kPause, utterance, 0, 0, String());
}

void SpeechSynthesis::DidResumeSpeaking(SpeechSynthesisUtterance* utterance) {
  is_paused_ = false;
  FireEvent(event_type_names::kResume, utterance, 0, 0, String());
}

void SpeechSynthesis::DidFinishSpeaking(SpeechSynthesisUtterance* utterance) {
  HandleSpeakingCompleted(utterance, false);
}

void SpeechSynthesis::SpeakingErrorOccurred(
    SpeechSynthesisUtterance* utterance) {
  HandleSpeakingCompleted(utterance, true);
}

void SpeechSynthesis::WordBoundaryEventOccurred(
    SpeechSynthesisUtterance* utterance,
    unsigned char_index,
    unsigned char_length) {
  DEFINE_STATIC_LOCAL(const String, word_boundary_string, ("word"));
  FireEvent(event_type_names::kBoundary, utterance, char_index, char_length,
            word_boundary_string);
}

void SpeechSynthesis::SentenceBoundaryEventOccurred(
    SpeechSynthesisUtterance* utterance,
    unsigned char_index,
    unsigned char_length) {
  DEFINE_STATIC_LOCAL(const String, sentence_boundary_string, ("sentence"));
  FireEvent(event_type_names::kBoundary, utterance, char_index, char_length,
            sentence_boundary_string);
}

void SpeechSynthesis::VoicesDidChange() {
  if (GetSupplementable()->GetFrame())
    DispatchEvent(*Event::Create(event_type_names::kVoiceschanged));
}

void SpeechSynthesis::StartSpeakingImmediately() {
  SpeechSynthesisUtterance* utterance = CurrentSpeechUtterance();
  DCHECK(utterance);

  double millis;
  if (!GetElapsedTimeMillis(&millis))
    return;

  utterance->SetStartTime(millis / 1000.0);
  is_paused_ = false;
  if (TryEnsureMojomSynthesis())
    utterance->Start(this);
}
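
// Shared completion path for both normal "end" and error termination of an
// utterance: pops the finished utterance if it is the one at the front of the
// queue, always reports the outcome to script, and then starts the next
// queued utterance, if any.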
void SpeechSynthesis::HandleSpeakingCompleted(
    SpeechSynthesisUtterance* utterance,
    bool error_occurred) {
  DCHECK(utterance);

  bool should_start_speaking = false;
  // If the utterance that completed was the one we're currently speaking,
  // remove it from the queue and start speaking the next one.
  if (utterance == CurrentSpeechUtterance()) {
    utterance_queue_.pop_front();
    should_start_speaking = !utterance_queue_.IsEmpty();
  }

  // Always fire the event, because the platform may have asynchronously
  // sent an event on an utterance before it got the message that we
  // canceled it, and we should always report to the user what actually
  // happened.
  if (error_occurred) {
    // TODO(csharrison): Actually pass the correct message. For now just use a
    // generic error.
    FireErrorEvent(utterance, 0, "synthesis-failed");
  } else {
    FireEvent(event_type_names::kEnd, utterance, 0, 0, String());
  }

  // Start the next utterance if we just finished one and one was pending.
  if (should_start_speaking && !utterance_queue_.IsEmpty())
    StartSpeakingImmediately();
}
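
// Builds and dispatches a SpeechSynthesisEvent on |utterance|. Note that
// elapsedTime is reported here in milliseconds, measured from the utterance's
// recorded start time.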
void SpeechSynthesis::FireEvent(const AtomicString& type,
                                SpeechSynthesisUtterance* utterance,
                                uint32_t char_index,
                                uint32_t char_length,
                                const String& name) {
  double millis;
  if (!GetElapsedTimeMillis(&millis))
    return;

  SpeechSynthesisEventInit* init = SpeechSynthesisEventInit::Create();
  init->setUtterance(utterance);
  init->setCharIndex(char_index);
  init->setCharLength(char_length);
  init->setElapsedTime(millis - (utterance->StartTime() * 1000.0));
  init->setName(name);
  utterance->DispatchEvent(*SpeechSynthesisEvent::Create(type, init));
}

void SpeechSynthesis::FireErrorEvent(SpeechSynthesisUtterance* utterance,
                                     uint32_t char_index,
                                     const String& error) {
  double millis;
  if (!GetElapsedTimeMillis(&millis))
    return;

  SpeechSynthesisErrorEventInit* init =
      SpeechSynthesisErrorEventInit::Create();
  init->setUtterance(utterance);
  init->setCharIndex(char_index);
  init->setElapsedTime(millis - (utterance->StartTime() * 1000.0));
  init->setError(error);
  utterance->DispatchEvent(
      *SpeechSynthesisErrorEvent::Create(event_type_names::kError, init));
}

SpeechSynthesisUtterance* SpeechSynthesis::CurrentSpeechUtterance() const {
  if (utterance_queue_.IsEmpty())
    return nullptr;

  return utterance_queue_.front();
}

ExecutionContext* SpeechSynthesis::GetExecutionContext() const {
  return GetSupplementable();
}

void SpeechSynthesis::Trace(Visitor* visitor) const {
  visitor->Trace(receiver_);
  visitor->Trace(mojom_synthesis_);
  visitor->Trace(voice_list_);
  visitor->Trace(utterance_queue_);
  Supplement<LocalDOMWindow>::Trace(visitor);
  EventTargetWithInlineData::Trace(visitor);
}
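
// Reads the window's Performance.now() clock, in milliseconds. Returns false
// (leaving |millis| unset) when the window is detached or its document has
// been stopped, in which case callers bail out without firing events.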
bool SpeechSynthesis::GetElapsedTimeMillis(double* millis) {
  if (!GetSupplementable()->GetFrame())
    return false;
  if (GetSupplementable()->document()->IsStopped())
    return false;

  *millis = DOMWindowPerformance::performance(*GetSupplementable())->now();
  return true;
}
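
// Speech is gated on the same signal as media autoplay: when the document's
// autoplay policy requires user activation, speak() is only allowed once the
// document has been allowed to play.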
bool SpeechSynthesis::IsAllowedToStartByAutoplay() const {
  Document* document = GetSupplementable()->document();
  DCHECK(document);

  // Note: could check the utterance->volume here, but that could be overridden
  // in the case of SSML.
  if (AutoplayPolicy::GetAutoplayPolicyForDocument(*document) !=
      AutoplayPolicy::Type::kDocumentUserActivationRequired) {
    return true;
  }
  return AutoplayPolicy::IsDocumentAllowedToPlay(*document);
}

void SpeechSynthesis::SetMojomSynthesisForTesting(
    mojo::PendingRemote<mojom::blink::SpeechSynthesis> mojom_synthesis) {
  mojom_synthesis_.Bind(
      std::move(mojom_synthesis),
      GetSupplementable()->GetTaskRunner(TaskType::kMiscPlatformAPI));
  receiver_.reset();
  mojom_synthesis_->AddVoiceListObserver(receiver_.BindNewPipeAndPassRemote(
      GetSupplementable()->GetTaskRunner(TaskType::kMiscPlatformAPI)));
}
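
// Lazily binds |mojom_synthesis_| to the browser-side speech synthesis
// service and registers |this| as the voice-list observer. Returns nullptr
// only when the remote is unbound and the frame is already detached; once
// bound, a dropped pipe simply discards calls.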
mojom::blink::SpeechSynthesis* SpeechSynthesis::TryEnsureMojomSynthesis() {
  if (mojom_synthesis_.is_bound())
    return mojom_synthesis_.get();

  // The frame could be detached. In that case, calls on mojom_synthesis_ will
  // just get dropped. That's okay and is simpler than having to null-check
  // mojom_synthesis_ before each use.
  LocalDOMWindow* window = GetSupplementable();
  if (!window->GetFrame())
    return nullptr;

  auto receiver = mojom_synthesis_.BindNewPipeAndPassReceiver(
      window->GetTaskRunner(TaskType::kMiscPlatformAPI));
  window->GetBrowserInterfaceBroker().GetInterface(std::move(receiver));

  mojom_synthesis_->AddVoiceListObserver(receiver_.BindNewPipeAndPassRemote(
      window->GetTaskRunner(TaskType::kMiscPlatformAPI)));
  return mojom_synthesis_.get();
}

const AtomicString& SpeechSynthesis::InterfaceName() const {
  return event_target_names::kSpeechSynthesis;
}

}  // namespace blink