blob: 17cd34e2a83eab72fb5ef4f0083df8b5bb326430 [file] [log] [blame]
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "third_party/blink/renderer/platform/text/hyphenation/hyphenation_minikin.h"
#include <algorithm>
#include <utility>
#include "base/files/file.h"
#include "base/files/memory_mapped_file.h"
#include "base/memory/ptr_util.h"
#include "base/metrics/histogram_macros.h"
#include "base/timer/elapsed_timer.h"
#include "mojo/public/cpp/bindings/remote.h"
#include "third_party/blink/public/common/thread_safe_browser_interface_broker_proxy.h"
#include "third_party/blink/public/mojom/hyphenation/hyphenation.mojom-blink.h"
#include "third_party/blink/public/platform/platform.h"
#include "third_party/blink/renderer/platform/text/character.h"
#include "third_party/blink/renderer/platform/text/hyphenation/hyphenator_aosp.h"
#include "third_party/blink/renderer/platform/text/layout_locale.h"
namespace blink {
namespace {
inline bool ShouldSkipLeadingChar(UChar32 c) {
if (Character::TreatAsSpace(c))
return true;
// Strip leading punctuation, defined as OP and QU line breaking classes,
// see UAX #14.
const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
if (lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)
return true;
return false;
}
inline bool ShouldSkipTrailingChar(UChar32 c) {
// Strip trailing spaces, punctuation and control characters.
const int32_t gc_mask = U_GET_GC_MASK(c);
return gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK | U_GC_CC_MASK);
}
} // namespace
using Hyphenator = android::Hyphenator;
static mojo::Remote<mojom::blink::Hyphenation> ConnectToRemoteService() {
mojo::Remote<mojom::blink::Hyphenation> service;
Platform::Current()->GetBrowserInterfaceBroker()->GetInterface(
service.BindNewPipeAndPassReceiver());
return service;
}
static mojom::blink::Hyphenation* GetService() {
DEFINE_STATIC_LOCAL(mojo::Remote<mojom::blink::Hyphenation>, service,
(ConnectToRemoteService()));
return service.get();
}
bool HyphenationMinikin::OpenDictionary(const AtomicString& locale) {
mojom::blink::Hyphenation* service = GetService();
base::File file;
base::ElapsedTimer timer;
service->OpenDictionary(locale, &file);
UMA_HISTOGRAM_TIMES("Hyphenation.Open", timer.Elapsed());
return OpenDictionary(std::move(file));
}
bool HyphenationMinikin::OpenDictionary(base::File file) {
if (!file.IsValid())
return false;
if (!file_.Initialize(std::move(file))) {
DLOG(ERROR) << "mmap failed";
return false;
}
hyphenator_ = base::WrapUnique(Hyphenator::loadBinary(file_.data()));
return true;
}
StringView HyphenationMinikin::WordToHyphenate(
const StringView& text,
unsigned* num_leading_chars_out) {
if (text.Is8Bit()) {
const LChar* begin = text.Characters8();
const LChar* end = begin + text.length();
while (begin != end && ShouldSkipLeadingChar(*begin))
++begin;
while (begin != end && ShouldSkipTrailingChar(end[-1]))
--end;
*num_leading_chars_out = begin - text.Characters8();
CHECK_GE(end, begin);
return StringView(begin, end - begin);
}
const UChar* begin = text.Characters16();
int index = 0;
int len = text.length();
while (index < len) {
int next_index = index;
UChar32 c;
U16_NEXT(begin, next_index, len, c);
if (!ShouldSkipLeadingChar(c))
break;
index = next_index;
}
while (index < len) {
int prev_len = len;
UChar32 c;
U16_PREV(begin, index, prev_len, c);
if (!ShouldSkipTrailingChar(c))
break;
len = prev_len;
}
*num_leading_chars_out = index;
CHECK_GE(len, index);
return StringView(begin + index, len - index);
}
Vector<uint8_t> HyphenationMinikin::Hyphenate(const StringView& text) const {
Vector<uint8_t> result;
if (text.Is8Bit()) {
String text16_bit = text.ToString();
text16_bit.Ensure16Bit();
hyphenator_->hyphenate(
&result, reinterpret_cast<const uint16_t*>(text16_bit.Characters16()),
text16_bit.length());
} else {
hyphenator_->hyphenate(
&result, reinterpret_cast<const uint16_t*>(text.Characters16()),
text.length());
}
return result;
}
wtf_size_t HyphenationMinikin::LastHyphenLocation(
const StringView& text,
wtf_size_t before_index) const {
unsigned num_leading_chars;
StringView word = WordToHyphenate(text, &num_leading_chars);
if (before_index <= num_leading_chars)
return 0;
before_index = std::min<wtf_size_t>(before_index - num_leading_chars,
word.length() - kMinimumSuffixLength);
if (word.length() < kMinimumPrefixLength + kMinimumSuffixLength ||
before_index <= kMinimumPrefixLength)
return 0;
Vector<uint8_t> result = Hyphenate(word);
CHECK_LE(before_index, result.size());
CHECK_GE(before_index, 1u);
static_assert(kMinimumPrefixLength >= 1, "|beforeIndex - 1| can underflow");
for (wtf_size_t i = before_index - 1; i >= kMinimumPrefixLength; i--) {
if (result[i])
return i + num_leading_chars;
}
return 0;
}
Vector<wtf_size_t, 8> HyphenationMinikin::HyphenLocations(
const StringView& text) const {
unsigned num_leading_chars;
StringView word = WordToHyphenate(text, &num_leading_chars);
Vector<wtf_size_t, 8> hyphen_locations;
if (word.length() < kMinimumPrefixLength + kMinimumSuffixLength)
return hyphen_locations;
Vector<uint8_t> result = Hyphenate(word);
static_assert(kMinimumPrefixLength >= 1,
"Change the 'if' above if this fails");
for (wtf_size_t i = word.length() - kMinimumSuffixLength - 1;
i >= kMinimumPrefixLength; i--) {
if (result[i])
hyphen_locations.push_back(i + num_leading_chars);
}
return hyphen_locations;
}
using LocaleMap = HashMap<AtomicString, AtomicString, CaseFoldingHash>;
static LocaleMap CreateLocaleFallbackMap() {
// This data is from CLDR, compiled by AOSP.
// https://android.googlesource.com/platform/frameworks/base/+/master/core/jni/android_text_Hyphenator.cpp
using LocaleFallback = const char * [2];
static LocaleFallback locale_fallback_data[] = {
// English locales that fall back to en-US. The data is from CLDR. It's
// all English locales,
// minus the locales whose parent is en-001 (from supplementalData.xml,
// under <parentLocales>).
{"en-AS", "en-us"}, // English (American Samoa)
{"en-GU", "en-us"}, // English (Guam)
{"en-MH", "en-us"}, // English (Marshall Islands)
{"en-MP", "en-us"}, // English (Northern Mariana Islands)
{"en-PR", "en-us"}, // English (Puerto Rico)
{"en-UM", "en-us"}, // English (United States Minor Outlying Islands)
{"en-VI", "en-us"}, // English (Virgin Islands)
// All English locales other than those falling back to en-US are mapped
// to en-GB.
{"en", "en-gb"},
// For German, we're assuming the 1996 (and later) orthography by default.
{"de", "de-1996"},
// Liechtenstein uses the Swiss hyphenation rules for the 1901
// orthography.
{"de-LI-1901", "de-ch-1901"},
// Norwegian is very probably Norwegian Bokmål.
{"no", "nb"},
// Use mn-Cyrl. According to CLDR's likelySubtags.xml, mn is most likely
// to be mn-Cyrl.
{"mn", "mn-cyrl"}, // Mongolian
// Fall back to Ethiopic script for languages likely to be written in
// Ethiopic.
// Data is from CLDR's likelySubtags.xml.
{"am", "und-ethi"}, // Amharic
{"byn", "und-ethi"}, // Blin
{"gez", "und-ethi"}, // Geʻez
{"ti", "und-ethi"}, // Tigrinya
{"wal", "und-ethi"}, // Wolaytta
// Use Hindi as a fallback hyphenator for all languages written in
// Devanagari, etc. This makes
// sense because our Indic patterns are not really linguistic, but
// script-based.
{"und-Beng", "bn"}, // Bengali
{"und-Deva", "hi"}, // Devanagari -> Hindi
{"und-Gujr", "gu"}, // Gujarati
{"und-Guru", "pa"}, // Gurmukhi -> Punjabi
{"und-Knda", "kn"}, // Kannada
{"und-Mlym", "ml"}, // Malayalam
{"und-Orya", "or"}, // Oriya
{"und-Taml", "ta"}, // Tamil
{"und-Telu", "te"}, // Telugu
// List of locales with hyphens not to fall back.
{"de-1901", nullptr},
{"de-1996", nullptr},
{"de-ch-1901", nullptr},
{"en-gb", nullptr},
{"en-us", nullptr},
{"mn-cyrl", nullptr},
{"und-ethi", nullptr},
};
LocaleMap map;
for (const auto& it : locale_fallback_data)
map.insert(AtomicString(it[0]), it[1]);
return map;
}
// static
AtomicString HyphenationMinikin::MapLocale(const AtomicString& locale) {
DEFINE_STATIC_LOCAL(LocaleMap, locale_fallback, (CreateLocaleFallbackMap()));
for (AtomicString mapped_locale = locale;;) {
const auto& it = locale_fallback.find(mapped_locale);
if (it != locale_fallback.end()) {
if (it->value)
return it->value;
return mapped_locale;
}
const wtf_size_t last_hyphen = mapped_locale.ReverseFind('-');
if (last_hyphen == kNotFound || !last_hyphen)
return mapped_locale;
mapped_locale = AtomicString(mapped_locale.GetString().Left(last_hyphen));
}
}
scoped_refptr<Hyphenation> Hyphenation::PlatformGetHyphenation(
const AtomicString& locale) {
const AtomicString mapped_locale = HyphenationMinikin::MapLocale(locale);
if (mapped_locale.Impl() != locale.Impl())
return LayoutLocale::Get(mapped_locale)->GetHyphenation();
scoped_refptr<HyphenationMinikin> hyphenation(
base::AdoptRef(new HyphenationMinikin));
if (hyphenation->OpenDictionary(locale.LowerASCII()))
return hyphenation;
return nullptr;
}
scoped_refptr<HyphenationMinikin> HyphenationMinikin::FromFileForTesting(
base::File file) {
scoped_refptr<HyphenationMinikin> hyphenation(
base::AdoptRef(new HyphenationMinikin));
if (hyphenation->OpenDictionary(std::move(file)))
return hyphenation;
return nullptr;
}
} // namespace blink