| /* |
| * Copyright (C) 2013 Apple Inc. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #include "third_party/blink/renderer/modules/speech/speech_synthesis.h" |
| |
| #include "build/build_config.h" |
| #include "third_party/blink/public/common/browser_interface_broker_proxy.h" |
| #include "third_party/blink/public/common/privacy_budget/identifiability_metric_builder.h" |
| #include "third_party/blink/public/common/privacy_budget/identifiability_study_settings.h" |
| #include "third_party/blink/public/common/privacy_budget/identifiable_token.h" |
| #include "third_party/blink/public/common/privacy_budget/identifiable_token_builder.h" |
| #include "third_party/blink/public/common/thread_safe_browser_interface_broker_proxy.h" |
| #include "third_party/blink/public/platform/platform.h" |
| #include "third_party/blink/renderer/bindings/modules/v8/v8_speech_synthesis_error_event_init.h" |
| #include "third_party/blink/renderer/bindings/modules/v8/v8_speech_synthesis_event_init.h" |
| #include "third_party/blink/renderer/core/dom/document.h" |
| #include "third_party/blink/renderer/core/frame/deprecation.h" |
| #include "third_party/blink/renderer/core/frame/local_dom_window.h" |
| #include "third_party/blink/renderer/core/frame/web_feature.h" |
| #include "third_party/blink/renderer/core/html/media/autoplay_policy.h" |
| #include "third_party/blink/renderer/core/timing/dom_window_performance.h" |
| #include "third_party/blink/renderer/core/timing/performance.h" |
| #include "third_party/blink/renderer/modules/speech/speech_synthesis_error_event.h" |
| #include "third_party/blink/renderer/modules/speech/speech_synthesis_event.h" |
| #include "third_party/blink/renderer/modules/speech/speech_synthesis_voice.h" |
| #include "third_party/blink/renderer/platform/instrumentation/use_counter.h" |
| #include "third_party/blink/renderer/platform/privacy_budget/identifiability_digest_helpers.h" |
| |
| namespace blink { |
| |
// Name string for this supplement. Presumably the key the Supplement<>
// machinery uses to look the object up on its LocalDOMWindow -- confirm
// against the Supplement implementation.
const char SpeechSynthesis::kSupplementName[] = "SpeechSynthesis";
| |
| SpeechSynthesis* SpeechSynthesis::speechSynthesis(LocalDOMWindow& window) { |
| SpeechSynthesis* synthesis = |
| Supplement<LocalDOMWindow>::From<SpeechSynthesis>(window); |
| if (!synthesis) { |
| synthesis = MakeGarbageCollected<SpeechSynthesis>(window); |
| ProvideTo(window, synthesis); |
| #if defined(OS_ANDROID) |
| // On Android devices we lazily initialize |mojom_synthesis_| to avoid |
| // needlessly binding to the TTS service, see https://crbug.com/811929. |
| // TODO(crbug/811929): Consider moving this logic into the Android- |
| // specific backend implementation. |
| #else |
| ignore_result(synthesis->TryEnsureMojomSynthesis()); |
| #endif |
| } |
| return synthesis; |
| } |
| |
| void SpeechSynthesis::CreateForTesting( |
| LocalDOMWindow& window, |
| mojo::PendingRemote<mojom::blink::SpeechSynthesis> mojom_synthesis) { |
| DCHECK(!Supplement<LocalDOMWindow>::From<SpeechSynthesis>(window)); |
| SpeechSynthesis* synthesis = MakeGarbageCollected<SpeechSynthesis>(window); |
| ProvideTo(window, synthesis); |
| synthesis->SetMojomSynthesisForTesting(std::move(mojom_synthesis)); |
| } |
| |
// Constructs the supplement for |window|. The voice-list observer receiver and
// the backend remote are both constructed with |window| as their context; no
// pipe is bound here (binding happens in TryEnsureMojomSynthesis() or
// SetMojomSynthesisForTesting()).
SpeechSynthesis::SpeechSynthesis(LocalDOMWindow& window)
    : Supplement<LocalDOMWindow>(window),
      receiver_(this, &window),
      mojom_synthesis_(&window) {}
| |
| void SpeechSynthesis::OnSetVoiceList( |
| Vector<mojom::blink::SpeechSynthesisVoicePtr> mojom_voices) { |
| voice_list_.clear(); |
| for (auto& mojom_voice : mojom_voices) { |
| voice_list_.push_back( |
| MakeGarbageCollected<SpeechSynthesisVoice>(std::move(mojom_voice))); |
| } |
| VoicesDidChange(); |
| } |
| |
// Returns the currently cached list of voices. Also attempts to connect to the
// backend (result deliberately ignored) and records the voice list for the
// identifiability study before returning.
const HeapVector<Member<SpeechSynthesisVoice>>& SpeechSynthesis::getVoices() {
  // Kick off initialization here to ensure voice list gets populated.
  ignore_result(TryEnsureMojomSynthesis());
  RecordVoicesForIdentifiability();
  return voice_list_;
}
| |
| void SpeechSynthesis::RecordVoicesForIdentifiability() const { |
| constexpr IdentifiableSurface surface = IdentifiableSurface::FromTypeAndToken( |
| IdentifiableSurface::Type::kWebFeature, |
| WebFeature::kSpeechSynthesis_GetVoices_Method); |
| if (!IdentifiabilityStudySettings::Get()->ShouldSample(surface)) |
| return; |
| if (!GetSupplementable()->GetFrame()) |
| return; |
| |
| IdentifiableTokenBuilder builder; |
| for (const auto& voice : voice_list_) { |
| builder.AddToken(IdentifiabilityBenignStringToken(voice->voiceURI())); |
| builder.AddToken(IdentifiabilityBenignStringToken(voice->lang())); |
| builder.AddToken(IdentifiabilityBenignStringToken(voice->name())); |
| builder.AddToken(voice->localService()); |
| } |
| IdentifiabilityMetricBuilder(GetSupplementable()->UkmSourceID()) |
| .Set(surface, builder.GetToken()) |
| .Record(GetSupplementable()->UkmRecorder()); |
| } |
| |
| bool SpeechSynthesis::speaking() const { |
| // If we have a current speech utterance, then that means we're assumed to be |
| // in a speaking state. This state is independent of whether the utterance |
| // happens to be paused. |
| return CurrentSpeechUtterance(); |
| } |
| |
| bool SpeechSynthesis::pending() const { |
| // This is true if there are any utterances that have not started. |
| // That means there will be more than one in the queue. |
| return utterance_queue_.size() > 1; |
| } |
| |
// Returns the paused flag, which is set in DidPauseSpeaking() and cleared in
// DidResumeSpeaking() and StartSpeakingImmediately().
bool SpeechSynthesis::paused() const {
  return is_paused_;
}
| |
| void SpeechSynthesis::speak(ScriptState* script_state, |
| SpeechSynthesisUtterance* utterance) { |
| DCHECK(utterance); |
| if (!script_state->ContextIsValid()) |
| return; |
| |
| // Note: Non-UseCounter based TTS metrics are of the form TextToSpeech.* and |
| // are generally global, whereas these are scoped to a single page load. |
| UseCounter::Count(GetSupplementable(), WebFeature::kTextToSpeech_Speak); |
| GetSupplementable()->CountUseOnlyInCrossOriginIframe( |
| WebFeature::kTextToSpeech_SpeakCrossOrigin); |
| if (!IsAllowedToStartByAutoplay()) { |
| Deprecation::CountDeprecation( |
| GetSupplementable(), |
| WebFeature::kTextToSpeech_SpeakDisallowedByAutoplay); |
| FireErrorEvent(utterance, 0 /* char_index */, "not-allowed"); |
| return; |
| } |
| |
| utterance_queue_.push_back(utterance); |
| |
| // If the queue was empty, speak this immediately. |
| if (utterance_queue_.size() == 1) |
| StartSpeakingImmediately(); |
| } |
| |
| void SpeechSynthesis::cancel() { |
| // Remove all the items from the utterance queue. The platform |
| // may still have references to some of these utterances and may |
| // fire events on them asynchronously. |
| utterance_queue_.clear(); |
| |
| if (mojom::blink::SpeechSynthesis* mojom_synthesis = |
| TryEnsureMojomSynthesis()) |
| mojom_synthesis->Cancel(); |
| } |
| |
| void SpeechSynthesis::pause() { |
| if (is_paused_) |
| return; |
| |
| if (mojom::blink::SpeechSynthesis* mojom_synthesis = |
| TryEnsureMojomSynthesis()) |
| mojom_synthesis->Pause(); |
| } |
| |
| void SpeechSynthesis::resume() { |
| if (!CurrentSpeechUtterance()) |
| return; |
| |
| if (mojom::blink::SpeechSynthesis* mojom_synthesis = |
| TryEnsureMojomSynthesis()) |
| mojom_synthesis->Resume(); |
| } |
| |
| void SpeechSynthesis::DidStartSpeaking(SpeechSynthesisUtterance* utterance) { |
| FireEvent(event_type_names::kStart, utterance, 0, 0, String()); |
| } |
| |
| void SpeechSynthesis::DidPauseSpeaking(SpeechSynthesisUtterance* utterance) { |
| is_paused_ = true; |
| FireEvent(event_type_names::kPause, utterance, 0, 0, String()); |
| } |
| |
| void SpeechSynthesis::DidResumeSpeaking(SpeechSynthesisUtterance* utterance) { |
| is_paused_ = false; |
| FireEvent(event_type_names::kResume, utterance, 0, 0, String()); |
| } |
| |
| void SpeechSynthesis::DidFinishSpeaking(SpeechSynthesisUtterance* utterance) { |
| HandleSpeakingCompleted(utterance, false); |
| } |
| |
| void SpeechSynthesis::SpeakingErrorOccurred( |
| SpeechSynthesisUtterance* utterance) { |
| HandleSpeakingCompleted(utterance, true); |
| } |
| |
| void SpeechSynthesis::WordBoundaryEventOccurred( |
| SpeechSynthesisUtterance* utterance, |
| unsigned char_index, |
| unsigned char_length) { |
| DEFINE_STATIC_LOCAL(const String, word_boundary_string, ("word")); |
| FireEvent(event_type_names::kBoundary, utterance, char_index, char_length, |
| word_boundary_string); |
| } |
| |
| void SpeechSynthesis::SentenceBoundaryEventOccurred( |
| SpeechSynthesisUtterance* utterance, |
| unsigned char_index, |
| unsigned char_length) { |
| DEFINE_STATIC_LOCAL(const String, sentence_boundary_string, ("sentence")); |
| FireEvent(event_type_names::kBoundary, utterance, char_index, char_length, |
| sentence_boundary_string); |
| } |
| |
| void SpeechSynthesis::VoicesDidChange() { |
| if (GetSupplementable()->GetFrame()) |
| DispatchEvent(*Event::Create(event_type_names::kVoiceschanged)); |
| } |
| |
| void SpeechSynthesis::StartSpeakingImmediately() { |
| SpeechSynthesisUtterance* utterance = CurrentSpeechUtterance(); |
| DCHECK(utterance); |
| |
| double millis; |
| if (!GetElapsedTimeMillis(&millis)) |
| return; |
| |
| utterance->SetStartTime(millis / 1000.0); |
| is_paused_ = false; |
| |
| if (TryEnsureMojomSynthesis()) |
| utterance->Start(this); |
| } |
| |
| void SpeechSynthesis::HandleSpeakingCompleted( |
| SpeechSynthesisUtterance* utterance, |
| bool error_occurred) { |
| DCHECK(utterance); |
| |
| bool should_start_speaking = false; |
| // If the utterance that completed was the one we're currently speaking, |
| // remove it from the queue and start speaking the next one. |
| if (utterance == CurrentSpeechUtterance()) { |
| utterance_queue_.pop_front(); |
| should_start_speaking = !utterance_queue_.empty(); |
| } |
| |
| // Always fire the event, because the platform may have asynchronously |
| // sent an event on an utterance before it got the message that we |
| // canceled it, and we should always report to the user what actually |
| // happened. |
| if (error_occurred) { |
| // TODO(csharrison): Actually pass the correct message. For now just use a |
| // generic error. |
| FireErrorEvent(utterance, 0, "synthesis-failed"); |
| } else { |
| FireEvent(event_type_names::kEnd, utterance, 0, 0, String()); |
| } |
| |
| // Start the next utterance if we just finished one and one was pending. |
| if (should_start_speaking && !utterance_queue_.IsEmpty()) |
| StartSpeakingImmediately(); |
| } |
| |
| void SpeechSynthesis::FireEvent(const AtomicString& type, |
| SpeechSynthesisUtterance* utterance, |
| uint32_t char_index, |
| uint32_t char_length, |
| const String& name) { |
| double millis; |
| if (!GetElapsedTimeMillis(&millis)) |
| return; |
| |
| SpeechSynthesisEventInit* init = SpeechSynthesisEventInit::Create(); |
| init->setUtterance(utterance); |
| init->setCharIndex(char_index); |
| init->setCharLength(char_length); |
| init->setElapsedTime(millis - (utterance->StartTime() * 1000.0)); |
| init->setName(name); |
| utterance->DispatchEvent(*SpeechSynthesisEvent::Create(type, init)); |
| } |
| |
| void SpeechSynthesis::FireErrorEvent(SpeechSynthesisUtterance* utterance, |
| uint32_t char_index, |
| const String& error) { |
| double millis; |
| if (!GetElapsedTimeMillis(&millis)) |
| return; |
| |
| SpeechSynthesisErrorEventInit* init = SpeechSynthesisErrorEventInit::Create(); |
| init->setUtterance(utterance); |
| init->setCharIndex(char_index); |
| init->setElapsedTime(millis - (utterance->StartTime() * 1000.0)); |
| init->setError(error); |
| utterance->DispatchEvent( |
| *SpeechSynthesisErrorEvent::Create(event_type_names::kError, init)); |
| } |
| |
| SpeechSynthesisUtterance* SpeechSynthesis::CurrentSpeechUtterance() const { |
| if (utterance_queue_.IsEmpty()) |
| return nullptr; |
| |
| return utterance_queue_.front(); |
| } |
| |
// Returns the window this object supplements as its execution context.
ExecutionContext* SpeechSynthesis::GetExecutionContext() const {
  return GetSupplementable();
}
| |
// Traces the mojo endpoints, the voice list and the utterance queue, then
// delegates to both base classes.
void SpeechSynthesis::Trace(Visitor* visitor) const {
  visitor->Trace(receiver_);
  visitor->Trace(mojom_synthesis_);
  visitor->Trace(voice_list_);
  visitor->Trace(utterance_queue_);
  Supplement<LocalDOMWindow>::Trace(visitor);
  EventTargetWithInlineData::Trace(visitor);
}
| |
| bool SpeechSynthesis::GetElapsedTimeMillis(double* millis) { |
| if (!GetSupplementable()->GetFrame()) |
| return false; |
| if (GetSupplementable()->document()->IsStopped()) |
| return false; |
| |
| *millis = DOMWindowPerformance::performance(*GetSupplementable())->now(); |
| return true; |
| } |
| |
| bool SpeechSynthesis::IsAllowedToStartByAutoplay() const { |
| Document* document = GetSupplementable()->document(); |
| DCHECK(document); |
| |
| // Note: could check the utterance->volume here, but that could be overriden |
| // in the case of SSML. |
| if (AutoplayPolicy::GetAutoplayPolicyForDocument(*document) != |
| AutoplayPolicy::Type::kDocumentUserActivationRequired) { |
| return true; |
| } |
| return AutoplayPolicy::IsDocumentAllowedToPlay(*document); |
| } |
| |
| void SpeechSynthesis::SetMojomSynthesisForTesting( |
| mojo::PendingRemote<mojom::blink::SpeechSynthesis> mojom_synthesis) { |
| mojom_synthesis_.Bind( |
| std::move(mojom_synthesis), |
| GetSupplementable()->GetTaskRunner(TaskType::kMiscPlatformAPI)); |
| receiver_.reset(); |
| mojom_synthesis_->AddVoiceListObserver(receiver_.BindNewPipeAndPassRemote( |
| GetSupplementable()->GetTaskRunner(TaskType::kMiscPlatformAPI))); |
| } |
| |
| mojom::blink::SpeechSynthesis* SpeechSynthesis::TryEnsureMojomSynthesis() { |
| if (mojom_synthesis_.is_bound()) |
| return mojom_synthesis_.get(); |
| |
| // The frame could be detached. In that case, calls on mojom_synthesis_ will |
| // just get dropped. That's okay and is simpler than having to null-check |
| // mojom_synthesis_ before each use. |
| LocalDOMWindow* window = GetSupplementable(); |
| if (!window->GetFrame()) |
| return nullptr; |
| |
| auto receiver = mojom_synthesis_.BindNewPipeAndPassReceiver( |
| window->GetTaskRunner(TaskType::kMiscPlatformAPI)); |
| |
| window->GetBrowserInterfaceBroker().GetInterface(std::move(receiver)); |
| |
| mojom_synthesis_->AddVoiceListObserver(receiver_.BindNewPipeAndPassRemote( |
| window->GetTaskRunner(TaskType::kMiscPlatformAPI))); |
| return mojom_synthesis_.get(); |
| } |
| |
// Returns the event-target interface name for this object.
const AtomicString& SpeechSynthesis::InterfaceName() const {
  return event_target_names::kSpeechSynthesis;
}
| |
| } // namespace blink |