mirror of
https://github.com/superseriousbusiness/gotosocial.git
synced 2025-02-08 22:51:34 +01:00
Improve titles, trim body to reasonable length
This commit is contained in:
parent
13eda35985
commit
93aeadbd9f
1
go.mod
1
go.mod
@ -66,6 +66,7 @@ require (
|
||||
github.com/ncruces/go-sqlite3 v0.22.0
|
||||
github.com/oklog/ulid v1.3.1
|
||||
github.com/prometheus/client_golang v1.20.5
|
||||
github.com/rivo/uniseg v0.4.7
|
||||
github.com/spf13/cobra v1.8.1
|
||||
github.com/spf13/viper v1.19.0
|
||||
github.com/stretchr/testify v1.10.0
|
||||
|
2
go.sum
generated
2
go.sum
generated
@ -476,6 +476,8 @@ github.com/quasoft/memstore v0.0.0-20191010062613-2bce066d2b0b h1:aUNXCGgukb4gtY
|
||||
github.com/quasoft/memstore v0.0.0-20191010062613-2bce066d2b0b/go.mod h1:wTPjTepVu7uJBYgZ0SdWHQlIas582j6cn2jgk4DDdlg=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
|
||||
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
||||
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
|
||||
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
|
||||
github.com/rogpeppe/go-internal v1.13.2-0.20241226121412-a5dc8ff20d0a h1:w3tdWGKbLGBPtR/8/oO74W6hmz0qE5q0z9aqSAewaaM=
|
||||
|
@ -56,7 +56,7 @@ type Notification struct {
|
||||
NotificationPendingReply NotificationType = 10 // NotificationPendingReply -- Someone has replied to a status of yours, which requires approval by you.
|
||||
NotificationPendingReblog NotificationType = 11 // NotificationPendingReblog -- Someone has boosted a status of yours, which requires approval by you.
|
||||
NotificationAdminReport NotificationType = 12 // NotificationAdminReport -- someone has submitted a new report to the instance.
|
||||
NotificationUpdate NotificationType = 13
|
||||
NotificationUpdate NotificationType = 13 // NotificationUpdate -- someone has edited their status.
|
||||
NotificationTypeNumValues NotificationType = 14 // NotificationTypeNumValues -- 1 + number of max notification type
|
||||
)
|
||||
|
||||
|
45
internal/text/substring.go
Normal file
45
internal/text/substring.go
Normal file
@ -0,0 +1,45 @@
|
||||
// GoToSocial
|
||||
// Copyright (C) GoToSocial Authors admin@gotosocial.org
|
||||
// SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package text
|
||||
|
||||
import (
|
||||
"github.com/rivo/uniseg"
|
||||
)
|
||||
|
||||
// FirstNBytesByWords produces a prefix substring of up to n bytes from a given string, respecting Unicode grapheme and
|
||||
// word boundaries. The substring may be empty, and may include leading or trailing whitespace.
|
||||
func FirstNBytesByWords(s string, n int) string {
|
||||
substringEnd := 0
|
||||
|
||||
graphemes := uniseg.NewGraphemes(s)
|
||||
for graphemes.Next() {
|
||||
|
||||
if !graphemes.IsWordBoundary() {
|
||||
continue
|
||||
}
|
||||
|
||||
_, end := graphemes.Positions()
|
||||
if end > n {
|
||||
break
|
||||
}
|
||||
|
||||
substringEnd = end
|
||||
}
|
||||
|
||||
return s[0:substringEnd]
|
||||
}
|
47
internal/text/substring_test.go
Normal file
47
internal/text/substring_test.go
Normal file
@ -0,0 +1,47 @@
|
||||
// GoToSocial
|
||||
// Copyright (C) GoToSocial Authors admin@gotosocial.org
|
||||
// SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package text_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/suite"
|
||||
"github.com/superseriousbusiness/gotosocial/internal/text"
|
||||
)
|
||||
|
||||
// SubstringTestSuite is the testify suite holding the tests for text.FirstNBytesByWords.
type SubstringTestSuite struct {
|
||||
suite.Suite
|
||||
}
|
||||
|
||||
func (suite *SubstringTestSuite) TestText() {
|
||||
suite.Equal(
|
||||
"Sphinx of black quartz, ",
|
||||
text.FirstNBytesByWords("Sphinx of black quartz, judge my vow!", 25),
|
||||
)
|
||||
}
|
||||
|
||||
func (suite *SubstringTestSuite) TestEmoji() {
|
||||
suite.Equal(
|
||||
"🏳️⚧️ ",
|
||||
text.FirstNBytesByWords("🏳️⚧️ 🙈", 20),
|
||||
)
|
||||
}
|
||||
|
||||
func TestSubstringTestSuite(t *testing.T) {
|
||||
suite.Run(t, new(SubstringTestSuite))
|
||||
}
|
@ -23,6 +23,7 @@
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
webpushgo "github.com/SherClockHolmes/webpush-go"
|
||||
@ -59,9 +60,13 @@ func NewRealSender(httpClient *http.Client, state *state.State) Sender {
|
||||
// while waiting for the client to retrieve them.
|
||||
const TTL = 48 * time.Hour
|
||||
|
||||
// responseBodyMaxLen limits how much of the Web Push server response we use for error messages.
|
||||
// responseBodyMaxLen limits how much of the Web Push server response we read for error messages.
|
||||
const responseBodyMaxLen = 1024
|
||||
|
||||
// bodyMaxLen is a polite maximum length for a Web Push notification's body text, in bytes.
|
||||
// Note that this isn't limited per se, but Web Push servers may reject anything with a total request body size over 4k.
|
||||
const bodyMaxLen = 3000
|
||||
|
||||
func (r *realSender) Send(
|
||||
ctx context.Context,
|
||||
notification *gtsmodel.Notification,
|
||||
@ -126,7 +131,7 @@ func (r *realSender) Send(
|
||||
vapidKeyPair,
|
||||
vapidSubjectEmail,
|
||||
subscription,
|
||||
notification.TargetAccount,
|
||||
notification,
|
||||
apiNotification,
|
||||
); err != nil {
|
||||
log.Errorf(
|
||||
@ -148,7 +153,7 @@ func (r *realSender) sendToSubscription(
|
||||
vapidKeyPair *gtsmodel.VAPIDKeyPair,
|
||||
vapidSubjectEmail string,
|
||||
subscription *gtsmodel.WebPushSubscription,
|
||||
targetAccount *gtsmodel.Account,
|
||||
notification *gtsmodel.Notification,
|
||||
apiNotification *apimodel.Notification,
|
||||
) error {
|
||||
// Get the associated access token.
|
||||
@ -162,7 +167,7 @@ func (r *realSender) sendToSubscription(
|
||||
NotificationID: apiNotification.ID,
|
||||
NotificationType: apiNotification.Type,
|
||||
Icon: apiNotification.Account.Avatar,
|
||||
PreferredLocale: targetAccount.Settings.Language,
|
||||
PreferredLocale: notification.TargetAccount.Settings.Language,
|
||||
AccessToken: token.Access,
|
||||
}
|
||||
|
||||
@ -171,8 +176,45 @@ func (r *realSender) sendToSubscription(
|
||||
if displayNameOrAcct == "" {
|
||||
displayNameOrAcct = apiNotification.Account.Acct
|
||||
}
|
||||
// TODO: (Vyr) improve copy
|
||||
pushNotification.Title = fmt.Sprintf("%s from %s", apiNotification.Type, displayNameOrAcct)
|
||||
switch notification.NotificationType {
|
||||
case gtsmodel.NotificationFollow:
|
||||
pushNotification.Title = fmt.Sprintf("%s followed you", displayNameOrAcct)
|
||||
case gtsmodel.NotificationFollowRequest:
|
||||
pushNotification.Title = fmt.Sprintf("%s requested to follow you", displayNameOrAcct)
|
||||
case gtsmodel.NotificationMention:
|
||||
pushNotification.Title = fmt.Sprintf("%s mentioned you", displayNameOrAcct)
|
||||
case gtsmodel.NotificationReblog:
|
||||
pushNotification.Title = fmt.Sprintf("%s boosted your post", displayNameOrAcct)
|
||||
case gtsmodel.NotificationFavourite:
|
||||
pushNotification.Title = fmt.Sprintf("%s faved your post", displayNameOrAcct)
|
||||
case gtsmodel.NotificationPoll:
|
||||
if subscription.AccountID == notification.TargetAccountID {
|
||||
pushNotification.Title = fmt.Sprintf("Your poll has ended")
|
||||
} else {
|
||||
pushNotification.Title = fmt.Sprintf("%s's poll has ended", displayNameOrAcct)
|
||||
}
|
||||
case gtsmodel.NotificationStatus:
|
||||
pushNotification.Title = fmt.Sprintf("%s posted", displayNameOrAcct)
|
||||
case gtsmodel.NotificationAdminSignup:
|
||||
pushNotification.Title = fmt.Sprintf("%s requested to sign up", displayNameOrAcct)
|
||||
case gtsmodel.NotificationPendingFave:
|
||||
pushNotification.Title = fmt.Sprintf("%s faved your post, which requires your approval", displayNameOrAcct)
|
||||
case gtsmodel.NotificationPendingReply:
|
||||
pushNotification.Title = fmt.Sprintf("%s mentioned you, which requires your approval", displayNameOrAcct)
|
||||
case gtsmodel.NotificationPendingReblog:
|
||||
pushNotification.Title = fmt.Sprintf("%s boosted your post, which requires your approval", displayNameOrAcct)
|
||||
case gtsmodel.NotificationAdminReport:
|
||||
pushNotification.Title = fmt.Sprintf("%s submitted a report", displayNameOrAcct)
|
||||
case gtsmodel.NotificationUpdate:
|
||||
pushNotification.Title = fmt.Sprintf("%s updated their post", displayNameOrAcct)
|
||||
default:
|
||||
log.Warnf(ctx, "Unknown notification type: %d", notification.NotificationType)
|
||||
pushNotification.Title = fmt.Sprintf(
|
||||
"%s did something (unknown notification type %d)",
|
||||
displayNameOrAcct,
|
||||
notification.NotificationType,
|
||||
)
|
||||
}
|
||||
|
||||
// Set the notification body.
|
||||
if apiNotification.Status != nil {
|
||||
@ -184,7 +226,7 @@ func (r *realSender) sendToSubscription(
|
||||
} else {
|
||||
pushNotification.Body = text.SanitizeToPlaintext(apiNotification.Account.Note)
|
||||
}
|
||||
// TODO: (Vyr) trim this
|
||||
pushNotification.Body = firstNBytesTrimSpace(pushNotification.Body, bodyMaxLen)
|
||||
|
||||
// Encode the push notification as JSON.
|
||||
pushNotificationBytes, err := json.Marshal(pushNotification)
|
||||
@ -221,6 +263,7 @@ func (r *realSender) sendToSubscription(
|
||||
if resp.StatusCode < 200 || resp.StatusCode > 299 {
|
||||
if resp.StatusCode >= 400 && resp.StatusCode <= 499 &&
|
||||
resp.StatusCode != http.StatusRequestTimeout &&
|
||||
resp.StatusCode != http.StatusRequestEntityTooLarge &&
|
||||
resp.StatusCode != http.StatusTooManyRequests {
|
||||
// We should not send any more notifications to this subscription. Try to delete it.
|
||||
if err := r.state.DB.DeleteWebPushSubscriptionByTokenID(ctx, subscription.TokenID); err != nil {
|
||||
@ -256,6 +299,11 @@ func (r *realSender) sendToSubscription(
|
||||
return nil
|
||||
}
|
||||
|
||||
// firstNBytesTrimSpace returns the first N bytes of a string, trimming leading and trailing whitespace.
|
||||
func firstNBytesTrimSpace(s string, n int) string {
|
||||
return strings.TrimSpace(text.FirstNBytesByWords(strings.TrimSpace(s), n))
|
||||
}
|
||||
|
||||
// gtsHTTPClientRoundTripper helps wrap a GtS HTTP client back into a regular HTTP client,
|
||||
// so that webpush-go can use our IP filters, bad hosts list, and retries.
|
||||
type gtsHTTPClientRoundTripper struct {
|
||||
|
@ -168,7 +168,7 @@ type notifyingReadCloser struct {
|
||||
bodyClosed chan struct{}
|
||||
}
|
||||
|
||||
func (rc *notifyingReadCloser) Read(p []byte) (n int, err error) {
|
||||
// Read never copies anything into the supplied buffer: it always reports zero bytes read together with io.EOF.
func (rc *notifyingReadCloser) Read(_ []byte) (n int, err error) {
|
||||
return 0, io.EOF
|
||||
}
|
||||
|
||||
|
21
vendor/github.com/rivo/uniseg/LICENSE.txt
generated
vendored
Normal file
21
vendor/github.com/rivo/uniseg/LICENSE.txt
generated
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2019 Oliver Kuederle
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
137
vendor/github.com/rivo/uniseg/README.md
generated
vendored
Normal file
137
vendor/github.com/rivo/uniseg/README.md
generated
vendored
Normal file
@ -0,0 +1,137 @@
|
||||
# Unicode Text Segmentation for Go
|
||||
|
||||
[![Go Reference](https://pkg.go.dev/badge/github.com/rivo/uniseg.svg)](https://pkg.go.dev/github.com/rivo/uniseg)
|
||||
[![Go Report](https://img.shields.io/badge/go%20report-A%2B-brightgreen.svg)](https://goreportcard.com/report/github.com/rivo/uniseg)
|
||||
|
||||
This Go package implements Unicode Text Segmentation according to [Unicode Standard Annex #29](https://unicode.org/reports/tr29/), Unicode Line Breaking according to [Unicode Standard Annex #14](https://unicode.org/reports/tr14/) (Unicode version 15.0.0), and monospace font string width calculation similar to [wcwidth](https://man7.org/linux/man-pages/man3/wcwidth.3.html).
|
||||
|
||||
## Background
|
||||
|
||||
### Grapheme Clusters
|
||||
|
||||
In Go, [strings are read-only slices of bytes](https://go.dev/blog/strings). They can be turned into Unicode code points using the `for` loop or by casting: `[]rune(str)`. However, multiple code points may be combined into one user-perceived character or what the Unicode specification calls "grapheme cluster". Here are some examples:
|
||||
|
||||
|String|Bytes (UTF-8)|Code points (runes)|Grapheme clusters|
|
||||
|-|-|-|-|
|
||||
|Käse|6 bytes: `4b 61 cc 88 73 65`|5 code points: `4b 61 308 73 65`|4 clusters: `[4b],[61 308],[73],[65]`|
|
||||
|🏳️🌈|14 bytes: `f0 9f 8f b3 ef b8 8f e2 80 8d f0 9f 8c 88`|4 code points: `1f3f3 fe0f 200d 1f308`|1 cluster: `[1f3f3 fe0f 200d 1f308]`|
|
||||
|🇩🇪|8 bytes: `f0 9f 87 a9 f0 9f 87 aa`|2 code points: `1f1e9 1f1ea`|1 cluster: `[1f1e9 1f1ea]`|
|
||||
|
||||
This package provides tools to iterate over these grapheme clusters. This may be used to determine the number of user-perceived characters, to split strings in their intended places, or to extract individual characters which form a unit.
|
||||
|
||||
### Word Boundaries
|
||||
|
||||
Word boundaries are used in a number of different contexts. The most familiar ones are selection (double-click mouse selection), cursor movement ("move to next word" control-arrow keys), and the dialog option "Whole Word Search" for search and replace. They are also used in database queries, to determine whether elements are within a certain number of words of one another. Searching may also use word boundaries in determining matching items. This package provides tools to determine word boundaries within strings.
|
||||
|
||||
### Sentence Boundaries
|
||||
|
||||
Sentence boundaries are often used for triple-click or some other method of selecting or iterating through blocks of text that are larger than single words. They are also used to determine whether words occur within the same sentence in database queries. This package provides tools to determine sentence boundaries within strings.
|
||||
|
||||
### Line Breaking
|
||||
|
||||
Line breaking, also known as word wrapping, is the process of breaking a section of text into lines such that it will fit in the available width of a page, window or other display area. This package provides tools to determine where a string may or may not be broken and where it must be broken (for example after newline characters).
|
||||
|
||||
### Monospace Width
|
||||
|
||||
Most terminals or text displays / text editors using a monospace font (for example source code editors) use a fixed width for each character. Some characters such as emojis or characters found in Asian and other languages may take up more than one character cell. This package provides tools to determine the number of cells a string will take up when displayed in a monospace font. See [here](https://pkg.go.dev/github.com/rivo/uniseg#hdr-Monospace_Width) for more information.
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
go get github.com/rivo/uniseg
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Counting Characters in a String
|
||||
|
||||
```go
|
||||
n := uniseg.GraphemeClusterCount("🇩🇪🏳️🌈")
|
||||
fmt.Println(n)
|
||||
// 2
|
||||
```
|
||||
|
||||
### Calculating the Monospace String Width
|
||||
|
||||
```go
|
||||
width := uniseg.StringWidth("🇩🇪🏳️🌈!")
|
||||
fmt.Println(width)
|
||||
// 5
|
||||
```
|
||||
|
||||
### Using the [`Graphemes`](https://pkg.go.dev/github.com/rivo/uniseg#Graphemes) Class
|
||||
|
||||
This is the most convenient method of iterating over grapheme clusters:
|
||||
|
||||
```go
|
||||
gr := uniseg.NewGraphemes("👍🏼!")
|
||||
for gr.Next() {
|
||||
fmt.Printf("%x ", gr.Runes())
|
||||
}
|
||||
// [1f44d 1f3fc] [21]
|
||||
```
|
||||
|
||||
### Using the [`Step`](https://pkg.go.dev/github.com/rivo/uniseg#Step) or [`StepString`](https://pkg.go.dev/github.com/rivo/uniseg#StepString) Function
|
||||
|
||||
This avoids allocating a new `Graphemes` object but it requires the handling of states and boundaries:
|
||||
|
||||
```go
|
||||
str := "🇩🇪🏳️🌈"
|
||||
state := -1
|
||||
var c string
|
||||
for len(str) > 0 {
|
||||
c, str, _, state = uniseg.StepString(str, state)
|
||||
fmt.Printf("%x ", []rune(c))
|
||||
}
|
||||
// [1f1e9 1f1ea] [1f3f3 fe0f 200d 1f308]
|
||||
```
|
||||
|
||||
### Advanced Examples
|
||||
|
||||
The [`Graphemes`](https://pkg.go.dev/github.com/rivo/uniseg#Graphemes) class offers the most convenient way to access all functionality of this package. But in some cases, it may be better to use the specialized functions directly. For example, if you're only interested in word segmentation, use [`FirstWord`](https://pkg.go.dev/github.com/rivo/uniseg#FirstWord) or [`FirstWordInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstWordInString):
|
||||
|
||||
```go
|
||||
str := "Hello, world!"
|
||||
state := -1
|
||||
var c string
|
||||
for len(str) > 0 {
|
||||
c, str, state = uniseg.FirstWordInString(str, state)
|
||||
fmt.Printf("(%s)\n", c)
|
||||
}
|
||||
// (Hello)
|
||||
// (,)
|
||||
// ( )
|
||||
// (world)
|
||||
// (!)
|
||||
```
|
||||
|
||||
Similarly, use
|
||||
|
||||
- [`FirstGraphemeCluster`](https://pkg.go.dev/github.com/rivo/uniseg#FirstGraphemeCluster) or [`FirstGraphemeClusterInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstGraphemeClusterInString) for grapheme cluster determination only,
|
||||
- [`FirstSentence`](https://pkg.go.dev/github.com/rivo/uniseg#FirstSentence) or [`FirstSentenceInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstSentenceInString) for sentence segmentation only, and
|
||||
- [`FirstLineSegment`](https://pkg.go.dev/github.com/rivo/uniseg#FirstLineSegment) or [`FirstLineSegmentInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstLineSegmentInString) for line breaking / word wrapping (although using [`Step`](https://pkg.go.dev/github.com/rivo/uniseg#Step) or [`StepString`](https://pkg.go.dev/github.com/rivo/uniseg#StepString) is preferred as it will observe grapheme cluster boundaries).
|
||||
|
||||
If you're only interested in the width of characters, use [`FirstGraphemeCluster`](https://pkg.go.dev/github.com/rivo/uniseg#FirstGraphemeCluster) or [`FirstGraphemeClusterInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstGraphemeClusterInString). It is much faster than using [`Step`](https://pkg.go.dev/github.com/rivo/uniseg#Step), [`StepString`](https://pkg.go.dev/github.com/rivo/uniseg#StepString), or the [`Graphemes`](https://pkg.go.dev/github.com/rivo/uniseg#Graphemes) class because it does not include the logic for word / sentence / line boundaries.
|
||||
|
||||
Finally, if you need to reverse a string while preserving grapheme clusters, use [`ReverseString`](https://pkg.go.dev/github.com/rivo/uniseg#ReverseString):
|
||||
|
||||
```go
|
||||
fmt.Println(uniseg.ReverseString("🇩🇪🏳️🌈"))
|
||||
// 🏳️🌈🇩🇪
|
||||
```
|
||||
|
||||
## Documentation
|
||||
|
||||
Refer to https://pkg.go.dev/github.com/rivo/uniseg for the package's documentation.
|
||||
|
||||
## Dependencies
|
||||
|
||||
This package does not depend on any packages outside the standard library.
|
||||
|
||||
## Sponsor this Project
|
||||
|
||||
[Become a Sponsor on GitHub](https://github.com/sponsors/rivo?metadata_source=uniseg_readme) to support this project!
|
||||
|
||||
## Your Feedback
|
||||
|
||||
Add your issue here on GitHub, preferably before submitting any PRs. Feel free to get in touch if you have any questions.
|
108
vendor/github.com/rivo/uniseg/doc.go
generated
vendored
Normal file
108
vendor/github.com/rivo/uniseg/doc.go
generated
vendored
Normal file
@ -0,0 +1,108 @@
|
||||
/*
|
||||
Package uniseg implements Unicode Text Segmentation, Unicode Line Breaking, and
|
||||
string width calculation for monospace fonts. Unicode Text Segmentation conforms
|
||||
to Unicode Standard Annex #29 (https://unicode.org/reports/tr29/) and Unicode
|
||||
Line Breaking conforms to Unicode Standard Annex #14
|
||||
(https://unicode.org/reports/tr14/).
|
||||
|
||||
In short, using this package, you can split a string into grapheme clusters
|
||||
(what people would usually refer to as a "character"), into words, and into
|
||||
sentences. Or, in its simplest case, this package allows you to count the number
|
||||
of characters in a string, especially when it contains complex characters such
|
||||
as emojis, combining characters, or characters from Asian, Arabic, Hebrew, or
|
||||
other languages. Additionally, you can use it to implement line breaking (or
|
||||
"word wrapping"), that is, to determine where text can be broken over to the
|
||||
next line when the width of the line is not big enough to fit the entire text.
|
||||
Finally, you can use it to calculate the display width of a string for monospace
|
||||
fonts.
|
||||
|
||||
# Getting Started
|
||||
|
||||
If you just want to count the number of characters in a string, you can use
|
||||
[GraphemeClusterCount]. If you want to determine the display width of a string,
|
||||
you can use [StringWidth]. If you want to iterate over a string, you can use
|
||||
[Step], [StepString], or the [Graphemes] class (more convenient but less
|
||||
performant). This will provide you with all information: grapheme clusters,
|
||||
word boundaries, sentence boundaries, line breaks, and monospace character
|
||||
widths. The specialized functions [FirstGraphemeCluster],
|
||||
[FirstGraphemeClusterInString], [FirstWord], [FirstWordInString],
|
||||
[FirstSentence], and [FirstSentenceInString] can be used if only one type of
|
||||
information is needed.
|
||||
|
||||
# Grapheme Clusters
|
||||
|
||||
Consider the rainbow flag emoji: 🏳️🌈. On most modern systems, it appears as one
|
||||
character. But its string representation actually has 14 bytes, so counting
|
||||
bytes (or using len("🏳️🌈")) will not work as expected. Counting runes won't,
|
||||
either: The flag has 4 Unicode code points, thus 4 runes. The stdlib function
|
||||
utf8.RuneCountInString("🏳️🌈") and len([]rune("🏳️🌈")) will both return 4.
|
||||
|
||||
The [GraphemeClusterCount] function will return 1 for the rainbow flag emoji.
|
||||
The Graphemes class and a variety of functions in this package will allow you to
|
||||
split strings into their grapheme clusters.
|
||||
|
||||
# Word Boundaries
|
||||
|
||||
Word boundaries are used in a number of different contexts. The most familiar
|
||||
ones are selection (double-click mouse selection), cursor movement ("move to
|
||||
next word" control-arrow keys), and the dialog option "Whole Word Search" for
|
||||
search and replace. This package provides methods for determining word
|
||||
boundaries.
|
||||
|
||||
# Sentence Boundaries
|
||||
|
||||
Sentence boundaries are often used for triple-click or some other method of
|
||||
selecting or iterating through blocks of text that are larger than single words.
|
||||
They are also used to determine whether words occur within the same sentence in
|
||||
database queries. This package provides methods for determining sentence
|
||||
boundaries.
|
||||
|
||||
# Line Breaking
|
||||
|
||||
Line breaking, also known as word wrapping, is the process of breaking a section
|
||||
of text into lines such that it will fit in the available width of a page,
|
||||
window or other display area. This package provides methods to determine the
|
||||
positions in a string where a line must be broken, may be broken, or must not be
|
||||
broken.
|
||||
|
||||
# Monospace Width
|
||||
|
||||
Monospace width, as referred to in this package, is the width of a string in a
|
||||
monospace font. This is commonly used in terminal user interfaces or text
|
||||
displays or editors that don't support proportional fonts. A width of 1
|
||||
corresponds to a single character cell. The C function [wcswidth()] and its
|
||||
implementation in other programming languages is in widespread use for the same
|
||||
purpose. However, there is no standard for the calculation of such widths, and
|
||||
this package differs from wcswidth() in a number of ways, presumably to generate
|
||||
more visually pleasing results.
|
||||
|
||||
To start, we assume that every code point has a width of 1, with the following
|
||||
exceptions:
|
||||
|
||||
- Code points with grapheme cluster break properties Control, CR, LF, Extend,
|
||||
and ZWJ have a width of 0.
|
||||
- U+2E3A, Two-Em Dash, has a width of 3.
|
||||
- U+2E3B, Three-Em Dash, has a width of 4.
|
||||
- Characters with the East-Asian Width properties "Fullwidth" (F) and "Wide"
|
||||
(W) have a width of 2. (Properties "Ambiguous" (A) and "Neutral" (N) both
|
||||
have a width of 1.)
|
||||
- Code points with grapheme cluster break property Regional Indicator have a
|
||||
width of 2.
|
||||
- Code points with grapheme cluster break property Extended Pictographic have
|
||||
a width of 2, unless their Emoji Presentation flag is "No", in which case
|
||||
the width is 1.
|
||||
|
||||
For Hangul grapheme clusters composed of conjoining Jamo and for Regional
|
||||
Indicators (flags), all code points except the first one have a width of 0. For
|
||||
grapheme clusters starting with an Extended Pictographic, any additional code
|
||||
point will force a total width of 2, except if the Variation Selector-15
|
||||
(U+FE0E) is included, in which case the total width is always 1. Grapheme
|
||||
clusters ending with Variation Selector-16 (U+FE0F) have a width of 2.
|
||||
|
||||
Note that whether these widths appear correct depends on your application's
|
||||
render engine, to which extent it conforms to the Unicode Standard, and its
|
||||
choice of font.
|
||||
|
||||
[wcswidth()]: https://man7.org/linux/man-pages/man3/wcswidth.3.html
|
||||
*/
|
||||
package uniseg
|
2588
vendor/github.com/rivo/uniseg/eastasianwidth.go
generated
vendored
Normal file
2588
vendor/github.com/rivo/uniseg/eastasianwidth.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
295
vendor/github.com/rivo/uniseg/emojipresentation.go
generated
vendored
Normal file
295
vendor/github.com/rivo/uniseg/emojipresentation.go
generated
vendored
Normal file
@ -0,0 +1,295 @@
|
||||
// Code generated via go generate from gen_properties.go. DO NOT EDIT.
|
||||
|
||||
package uniseg
|
||||
|
||||
// emojiPresentation are taken from
|
||||
//
|
||||
// and
|
||||
// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
|
||||
// ("Extended_Pictographic" only)
|
||||
// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode
|
||||
// license agreement.
|
||||
var emojiPresentation = [][3]int{
|
||||
{0x231A, 0x231B, prEmojiPresentation}, // E0.6 [2] (⌚..⌛) watch..hourglass done
|
||||
{0x23E9, 0x23EC, prEmojiPresentation}, // E0.6 [4] (⏩..⏬) fast-forward button..fast down button
|
||||
{0x23F0, 0x23F0, prEmojiPresentation}, // E0.6 [1] (⏰) alarm clock
|
||||
{0x23F3, 0x23F3, prEmojiPresentation}, // E0.6 [1] (⏳) hourglass not done
|
||||
{0x25FD, 0x25FE, prEmojiPresentation}, // E0.6 [2] (◽..◾) white medium-small square..black medium-small square
|
||||
{0x2614, 0x2615, prEmojiPresentation}, // E0.6 [2] (☔..☕) umbrella with rain drops..hot beverage
|
||||
{0x2648, 0x2653, prEmojiPresentation}, // E0.6 [12] (♈..♓) Aries..Pisces
|
||||
{0x267F, 0x267F, prEmojiPresentation}, // E0.6 [1] (♿) wheelchair symbol
|
||||
{0x2693, 0x2693, prEmojiPresentation}, // E0.6 [1] (⚓) anchor
|
||||
{0x26A1, 0x26A1, prEmojiPresentation}, // E0.6 [1] (⚡) high voltage
|
||||
{0x26AA, 0x26AB, prEmojiPresentation}, // E0.6 [2] (⚪..⚫) white circle..black circle
|
||||
{0x26BD, 0x26BE, prEmojiPresentation}, // E0.6 [2] (⚽..⚾) soccer ball..baseball
|
||||
{0x26C4, 0x26C5, prEmojiPresentation}, // E0.6 [2] (⛄..⛅) snowman without snow..sun behind cloud
|
||||
{0x26CE, 0x26CE, prEmojiPresentation}, // E0.6 [1] (⛎) Ophiuchus
|
||||
{0x26D4, 0x26D4, prEmojiPresentation}, // E0.6 [1] (⛔) no entry
|
||||
{0x26EA, 0x26EA, prEmojiPresentation}, // E0.6 [1] (⛪) church
|
||||
{0x26F2, 0x26F3, prEmojiPresentation}, // E0.6 [2] (⛲..⛳) fountain..flag in hole
|
||||
{0x26F5, 0x26F5, prEmojiPresentation}, // E0.6 [1] (⛵) sailboat
|
||||
{0x26FA, 0x26FA, prEmojiPresentation}, // E0.6 [1] (⛺) tent
|
||||
{0x26FD, 0x26FD, prEmojiPresentation}, // E0.6 [1] (⛽) fuel pump
|
||||
{0x2705, 0x2705, prEmojiPresentation}, // E0.6 [1] (✅) check mark button
|
||||
{0x270A, 0x270B, prEmojiPresentation}, // E0.6 [2] (✊..✋) raised fist..raised hand
|
||||
{0x2728, 0x2728, prEmojiPresentation}, // E0.6 [1] (✨) sparkles
|
||||
{0x274C, 0x274C, prEmojiPresentation}, // E0.6 [1] (❌) cross mark
|
||||
{0x274E, 0x274E, prEmojiPresentation}, // E0.6 [1] (❎) cross mark button
|
||||
{0x2753, 0x2755, prEmojiPresentation}, // E0.6 [3] (❓..❕) red question mark..white exclamation mark
|
||||
{0x2757, 0x2757, prEmojiPresentation}, // E0.6 [1] (❗) red exclamation mark
|
||||
{0x2795, 0x2797, prEmojiPresentation}, // E0.6 [3] (➕..➗) plus..divide
|
||||
{0x27B0, 0x27B0, prEmojiPresentation}, // E0.6 [1] (➰) curly loop
|
||||
{0x27BF, 0x27BF, prEmojiPresentation}, // E1.0 [1] (➿) double curly loop
|
||||
{0x2B1B, 0x2B1C, prEmojiPresentation}, // E0.6 [2] (⬛..⬜) black large square..white large square
|
||||
{0x2B50, 0x2B50, prEmojiPresentation}, // E0.6 [1] (⭐) star
|
||||
{0x2B55, 0x2B55, prEmojiPresentation}, // E0.6 [1] (⭕) hollow red circle
|
||||
{0x1F004, 0x1F004, prEmojiPresentation}, // E0.6 [1] (🀄) mahjong red dragon
|
||||
{0x1F0CF, 0x1F0CF, prEmojiPresentation}, // E0.6 [1] (🃏) joker
|
||||
{0x1F18E, 0x1F18E, prEmojiPresentation}, // E0.6 [1] (🆎) AB button (blood type)
|
||||
{0x1F191, 0x1F19A, prEmojiPresentation}, // E0.6 [10] (🆑..🆚) CL button..VS button
|
||||
{0x1F1E6, 0x1F1FF, prEmojiPresentation}, // E0.0 [26] (🇦..🇿) regional indicator symbol letter a..regional indicator symbol letter z
|
||||
{0x1F201, 0x1F201, prEmojiPresentation}, // E0.6 [1] (🈁) Japanese “here” button
|
||||
{0x1F21A, 0x1F21A, prEmojiPresentation}, // E0.6 [1] (🈚) Japanese “free of charge” button
|
||||
{0x1F22F, 0x1F22F, prEmojiPresentation}, // E0.6 [1] (🈯) Japanese “reserved” button
|
||||
{0x1F232, 0x1F236, prEmojiPresentation}, // E0.6 [5] (🈲..🈶) Japanese “prohibited” button..Japanese “not free of charge” button
|
||||
{0x1F238, 0x1F23A, prEmojiPresentation}, // E0.6 [3] (🈸..🈺) Japanese “application” button..Japanese “open for business” button
|
||||
{0x1F250, 0x1F251, prEmojiPresentation}, // E0.6 [2] (🉐..🉑) Japanese “bargain” button..Japanese “acceptable” button
|
||||
{0x1F300, 0x1F30C, prEmojiPresentation}, // E0.6 [13] (🌀..🌌) cyclone..milky way
|
||||
{0x1F30D, 0x1F30E, prEmojiPresentation}, // E0.7 [2] (🌍..🌎) globe showing Europe-Africa..globe showing Americas
|
||||
{0x1F30F, 0x1F30F, prEmojiPresentation}, // E0.6 [1] (🌏) globe showing Asia-Australia
|
||||
{0x1F310, 0x1F310, prEmojiPresentation}, // E1.0 [1] (🌐) globe with meridians
|
||||
{0x1F311, 0x1F311, prEmojiPresentation}, // E0.6 [1] (🌑) new moon
|
||||
{0x1F312, 0x1F312, prEmojiPresentation}, // E1.0 [1] (🌒) waxing crescent moon
|
||||
{0x1F313, 0x1F315, prEmojiPresentation}, // E0.6 [3] (🌓..🌕) first quarter moon..full moon
|
||||
{0x1F316, 0x1F318, prEmojiPresentation}, // E1.0 [3] (🌖..🌘) waning gibbous moon..waning crescent moon
|
||||
{0x1F319, 0x1F319, prEmojiPresentation}, // E0.6 [1] (🌙) crescent moon
|
||||
{0x1F31A, 0x1F31A, prEmojiPresentation}, // E1.0 [1] (🌚) new moon face
|
||||
{0x1F31B, 0x1F31B, prEmojiPresentation}, // E0.6 [1] (🌛) first quarter moon face
|
||||
{0x1F31C, 0x1F31C, prEmojiPresentation}, // E0.7 [1] (🌜) last quarter moon face
|
||||
{0x1F31D, 0x1F31E, prEmojiPresentation}, // E1.0 [2] (🌝..🌞) full moon face..sun with face
|
||||
{0x1F31F, 0x1F320, prEmojiPresentation}, // E0.6 [2] (🌟..🌠) glowing star..shooting star
|
||||
{0x1F32D, 0x1F32F, prEmojiPresentation}, // E1.0 [3] (🌭..🌯) hot dog..burrito
|
||||
{0x1F330, 0x1F331, prEmojiPresentation}, // E0.6 [2] (🌰..🌱) chestnut..seedling
|
||||
{0x1F332, 0x1F333, prEmojiPresentation}, // E1.0 [2] (🌲..🌳) evergreen tree..deciduous tree
|
||||
{0x1F334, 0x1F335, prEmojiPresentation}, // E0.6 [2] (🌴..🌵) palm tree..cactus
|
||||
{0x1F337, 0x1F34A, prEmojiPresentation}, // E0.6 [20] (🌷..🍊) tulip..tangerine
|
||||
{0x1F34B, 0x1F34B, prEmojiPresentation}, // E1.0 [1] (🍋) lemon
|
||||
{0x1F34C, 0x1F34F, prEmojiPresentation}, // E0.6 [4] (🍌..🍏) banana..green apple
|
||||
{0x1F350, 0x1F350, prEmojiPresentation}, // E1.0 [1] (🍐) pear
|
||||
{0x1F351, 0x1F37B, prEmojiPresentation}, // E0.6 [43] (🍑..🍻) peach..clinking beer mugs
|
||||
{0x1F37C, 0x1F37C, prEmojiPresentation}, // E1.0 [1] (🍼) baby bottle
|
||||
{0x1F37E, 0x1F37F, prEmojiPresentation}, // E1.0 [2] (🍾..🍿) bottle with popping cork..popcorn
|
||||
{0x1F380, 0x1F393, prEmojiPresentation}, // E0.6 [20] (🎀..🎓) ribbon..graduation cap
|
||||
{0x1F3A0, 0x1F3C4, prEmojiPresentation}, // E0.6 [37] (🎠..🏄) carousel horse..person surfing
|
||||
{0x1F3C5, 0x1F3C5, prEmojiPresentation}, // E1.0 [1] (🏅) sports medal
|
||||
{0x1F3C6, 0x1F3C6, prEmojiPresentation}, // E0.6 [1] (🏆) trophy
|
||||
{0x1F3C7, 0x1F3C7, prEmojiPresentation}, // E1.0 [1] (🏇) horse racing
|
||||
{0x1F3C8, 0x1F3C8, prEmojiPresentation}, // E0.6 [1] (🏈) american football
|
||||
{0x1F3C9, 0x1F3C9, prEmojiPresentation}, // E1.0 [1] (🏉) rugby football
|
||||
{0x1F3CA, 0x1F3CA, prEmojiPresentation}, // E0.6 [1] (🏊) person swimming
|
||||
{0x1F3CF, 0x1F3D3, prEmojiPresentation}, // E1.0 [5] (🏏..🏓) cricket game..ping pong
|
||||
{0x1F3E0, 0x1F3E3, prEmojiPresentation}, // E0.6 [4] (🏠..🏣) house..Japanese post office
|
||||
{0x1F3E4, 0x1F3E4, prEmojiPresentation}, // E1.0 [1] (🏤) post office
|
||||
{0x1F3E5, 0x1F3F0, prEmojiPresentation}, // E0.6 [12] (🏥..🏰) hospital..castle
|
||||
{0x1F3F4, 0x1F3F4, prEmojiPresentation}, // E1.0 [1] (🏴) black flag
|
||||
{0x1F3F8, 0x1F407, prEmojiPresentation}, // E1.0 [16] (🏸..🐇) badminton..rabbit
|
||||
{0x1F408, 0x1F408, prEmojiPresentation}, // E0.7 [1] (🐈) cat
|
||||
{0x1F409, 0x1F40B, prEmojiPresentation}, // E1.0 [3] (🐉..🐋) dragon..whale
|
||||
{0x1F40C, 0x1F40E, prEmojiPresentation}, // E0.6 [3] (🐌..🐎) snail..horse
|
||||
{0x1F40F, 0x1F410, prEmojiPresentation}, // E1.0 [2] (🐏..🐐) ram..goat
|
||||
{0x1F411, 0x1F412, prEmojiPresentation}, // E0.6 [2] (🐑..🐒) ewe..monkey
|
||||
{0x1F413, 0x1F413, prEmojiPresentation}, // E1.0 [1] (🐓) rooster
|
||||
{0x1F414, 0x1F414, prEmojiPresentation}, // E0.6 [1] (🐔) chicken
|
||||
{0x1F415, 0x1F415, prEmojiPresentation}, // E0.7 [1] (🐕) dog
|
||||
{0x1F416, 0x1F416, prEmojiPresentation}, // E1.0 [1] (🐖) pig
|
||||
{0x1F417, 0x1F429, prEmojiPresentation}, // E0.6 [19] (🐗..🐩) boar..poodle
|
||||
{0x1F42A, 0x1F42A, prEmojiPresentation}, // E1.0 [1] (🐪) camel
|
||||
{0x1F42B, 0x1F43E, prEmojiPresentation}, // E0.6 [20] (🐫..🐾) two-hump camel..paw prints
|
||||
{0x1F440, 0x1F440, prEmojiPresentation}, // E0.6 [1] (👀) eyes
|
||||
{0x1F442, 0x1F464, prEmojiPresentation}, // E0.6 [35] (👂..👤) ear..bust in silhouette
|
||||
{0x1F465, 0x1F465, prEmojiPresentation}, // E1.0 [1] (👥) busts in silhouette
|
||||
{0x1F466, 0x1F46B, prEmojiPresentation}, // E0.6 [6] (👦..👫) boy..woman and man holding hands
|
||||
{0x1F46C, 0x1F46D, prEmojiPresentation}, // E1.0 [2] (👬..👭) men holding hands..women holding hands
|
||||
{0x1F46E, 0x1F4AC, prEmojiPresentation}, // E0.6 [63] (👮..💬) police officer..speech balloon
|
||||
{0x1F4AD, 0x1F4AD, prEmojiPresentation}, // E1.0 [1] (💭) thought balloon
|
||||
{0x1F4AE, 0x1F4B5, prEmojiPresentation}, // E0.6 [8] (💮..💵) white flower..dollar banknote
|
||||
{0x1F4B6, 0x1F4B7, prEmojiPresentation}, // E1.0 [2] (💶..💷) euro banknote..pound banknote
|
||||
{0x1F4B8, 0x1F4EB, prEmojiPresentation}, // E0.6 [52] (💸..📫) money with wings..closed mailbox with raised flag
|
||||
{0x1F4EC, 0x1F4ED, prEmojiPresentation}, // E0.7 [2] (📬..📭) open mailbox with raised flag..open mailbox with lowered flag
|
||||
{0x1F4EE, 0x1F4EE, prEmojiPresentation}, // E0.6 [1] (📮) postbox
|
||||
{0x1F4EF, 0x1F4EF, prEmojiPresentation}, // E1.0 [1] (📯) postal horn
|
||||
{0x1F4F0, 0x1F4F4, prEmojiPresentation}, // E0.6 [5] (📰..📴) newspaper..mobile phone off
|
||||
{0x1F4F5, 0x1F4F5, prEmojiPresentation}, // E1.0 [1] (📵) no mobile phones
|
||||
{0x1F4F6, 0x1F4F7, prEmojiPresentation}, // E0.6 [2] (📶..📷) antenna bars..camera
|
||||
{0x1F4F8, 0x1F4F8, prEmojiPresentation}, // E1.0 [1] (📸) camera with flash
|
||||
{0x1F4F9, 0x1F4FC, prEmojiPresentation}, // E0.6 [4] (📹..📼) video camera..videocassette
|
||||
{0x1F4FF, 0x1F502, prEmojiPresentation}, // E1.0 [4] (📿..🔂) prayer beads..repeat single button
|
||||
{0x1F503, 0x1F503, prEmojiPresentation}, // E0.6 [1] (🔃) clockwise vertical arrows
|
||||
{0x1F504, 0x1F507, prEmojiPresentation}, // E1.0 [4] (🔄..🔇) counterclockwise arrows button..muted speaker
|
||||
{0x1F508, 0x1F508, prEmojiPresentation}, // E0.7 [1] (🔈) speaker low volume
|
||||
{0x1F509, 0x1F509, prEmojiPresentation}, // E1.0 [1] (🔉) speaker medium volume
|
||||
{0x1F50A, 0x1F514, prEmojiPresentation}, // E0.6 [11] (🔊..🔔) speaker high volume..bell
|
||||
{0x1F515, 0x1F515, prEmojiPresentation}, // E1.0 [1] (🔕) bell with slash
|
||||
{0x1F516, 0x1F52B, prEmojiPresentation}, // E0.6 [22] (🔖..🔫) bookmark..water pistol
|
||||
{0x1F52C, 0x1F52D, prEmojiPresentation}, // E1.0 [2] (🔬..🔭) microscope..telescope
|
||||
{0x1F52E, 0x1F53D, prEmojiPresentation}, // E0.6 [16] (🔮..🔽) crystal ball..downwards button
|
||||
{0x1F54B, 0x1F54E, prEmojiPresentation}, // E1.0 [4] (🕋..🕎) kaaba..menorah
|
||||
{0x1F550, 0x1F55B, prEmojiPresentation}, // E0.6 [12] (🕐..🕛) one o’clock..twelve o’clock
|
||||
{0x1F55C, 0x1F567, prEmojiPresentation}, // E0.7 [12] (🕜..🕧) one-thirty..twelve-thirty
|
||||
{0x1F57A, 0x1F57A, prEmojiPresentation}, // E3.0 [1] (🕺) man dancing
|
||||
{0x1F595, 0x1F596, prEmojiPresentation}, // E1.0 [2] (🖕..🖖) middle finger..vulcan salute
|
||||
{0x1F5A4, 0x1F5A4, prEmojiPresentation}, // E3.0 [1] (🖤) black heart
|
||||
{0x1F5FB, 0x1F5FF, prEmojiPresentation}, // E0.6 [5] (🗻..🗿) mount fuji..moai
|
||||
{0x1F600, 0x1F600, prEmojiPresentation}, // E1.0 [1] (😀) grinning face
|
||||
{0x1F601, 0x1F606, prEmojiPresentation}, // E0.6 [6] (😁..😆) beaming face with smiling eyes..grinning squinting face
|
||||
{0x1F607, 0x1F608, prEmojiPresentation}, // E1.0 [2] (😇..😈) smiling face with halo..smiling face with horns
|
||||
{0x1F609, 0x1F60D, prEmojiPresentation}, // E0.6 [5] (😉..😍) winking face..smiling face with heart-eyes
|
||||
{0x1F60E, 0x1F60E, prEmojiPresentation}, // E1.0 [1] (😎) smiling face with sunglasses
|
||||
{0x1F60F, 0x1F60F, prEmojiPresentation}, // E0.6 [1] (😏) smirking face
|
||||
{0x1F610, 0x1F610, prEmojiPresentation}, // E0.7 [1] (😐) neutral face
|
||||
{0x1F611, 0x1F611, prEmojiPresentation}, // E1.0 [1] (😑) expressionless face
|
||||
{0x1F612, 0x1F614, prEmojiPresentation}, // E0.6 [3] (😒..😔) unamused face..pensive face
|
||||
{0x1F615, 0x1F615, prEmojiPresentation}, // E1.0 [1] (😕) confused face
|
||||
{0x1F616, 0x1F616, prEmojiPresentation}, // E0.6 [1] (😖) confounded face
|
||||
{0x1F617, 0x1F617, prEmojiPresentation}, // E1.0 [1] (😗) kissing face
|
||||
{0x1F618, 0x1F618, prEmojiPresentation}, // E0.6 [1] (😘) face blowing a kiss
|
||||
{0x1F619, 0x1F619, prEmojiPresentation}, // E1.0 [1] (😙) kissing face with smiling eyes
|
||||
{0x1F61A, 0x1F61A, prEmojiPresentation}, // E0.6 [1] (😚) kissing face with closed eyes
|
||||
{0x1F61B, 0x1F61B, prEmojiPresentation}, // E1.0 [1] (😛) face with tongue
|
||||
{0x1F61C, 0x1F61E, prEmojiPresentation}, // E0.6 [3] (😜..😞) winking face with tongue..disappointed face
|
||||
{0x1F61F, 0x1F61F, prEmojiPresentation}, // E1.0 [1] (😟) worried face
|
||||
{0x1F620, 0x1F625, prEmojiPresentation}, // E0.6 [6] (😠..😥) angry face..sad but relieved face
|
||||
{0x1F626, 0x1F627, prEmojiPresentation}, // E1.0 [2] (😦..😧) frowning face with open mouth..anguished face
|
||||
{0x1F628, 0x1F62B, prEmojiPresentation}, // E0.6 [4] (😨..😫) fearful face..tired face
|
||||
{0x1F62C, 0x1F62C, prEmojiPresentation}, // E1.0 [1] (😬) grimacing face
|
||||
{0x1F62D, 0x1F62D, prEmojiPresentation}, // E0.6 [1] (😭) loudly crying face
|
||||
{0x1F62E, 0x1F62F, prEmojiPresentation}, // E1.0 [2] (😮..😯) face with open mouth..hushed face
|
||||
{0x1F630, 0x1F633, prEmojiPresentation}, // E0.6 [4] (😰..😳) anxious face with sweat..flushed face
|
||||
{0x1F634, 0x1F634, prEmojiPresentation}, // E1.0 [1] (😴) sleeping face
|
||||
{0x1F635, 0x1F635, prEmojiPresentation}, // E0.6 [1] (😵) face with crossed-out eyes
|
||||
{0x1F636, 0x1F636, prEmojiPresentation}, // E1.0 [1] (😶) face without mouth
|
||||
{0x1F637, 0x1F640, prEmojiPresentation}, // E0.6 [10] (😷..🙀) face with medical mask..weary cat
|
||||
{0x1F641, 0x1F644, prEmojiPresentation}, // E1.0 [4] (🙁..🙄) slightly frowning face..face with rolling eyes
|
||||
{0x1F645, 0x1F64F, prEmojiPresentation}, // E0.6 [11] (🙅..🙏) person gesturing NO..folded hands
|
||||
{0x1F680, 0x1F680, prEmojiPresentation}, // E0.6 [1] (🚀) rocket
|
||||
{0x1F681, 0x1F682, prEmojiPresentation}, // E1.0 [2] (🚁..🚂) helicopter..locomotive
|
||||
{0x1F683, 0x1F685, prEmojiPresentation}, // E0.6 [3] (🚃..🚅) railway car..bullet train
|
||||
{0x1F686, 0x1F686, prEmojiPresentation}, // E1.0 [1] (🚆) train
|
||||
{0x1F687, 0x1F687, prEmojiPresentation}, // E0.6 [1] (🚇) metro
|
||||
{0x1F688, 0x1F688, prEmojiPresentation}, // E1.0 [1] (🚈) light rail
|
||||
{0x1F689, 0x1F689, prEmojiPresentation}, // E0.6 [1] (🚉) station
|
||||
{0x1F68A, 0x1F68B, prEmojiPresentation}, // E1.0 [2] (🚊..🚋) tram..tram car
|
||||
{0x1F68C, 0x1F68C, prEmojiPresentation}, // E0.6 [1] (🚌) bus
|
||||
{0x1F68D, 0x1F68D, prEmojiPresentation}, // E0.7 [1] (🚍) oncoming bus
|
||||
{0x1F68E, 0x1F68E, prEmojiPresentation}, // E1.0 [1] (🚎) trolleybus
|
||||
{0x1F68F, 0x1F68F, prEmojiPresentation}, // E0.6 [1] (🚏) bus stop
|
||||
{0x1F690, 0x1F690, prEmojiPresentation}, // E1.0 [1] (🚐) minibus
|
||||
{0x1F691, 0x1F693, prEmojiPresentation}, // E0.6 [3] (🚑..🚓) ambulance..police car
|
||||
{0x1F694, 0x1F694, prEmojiPresentation}, // E0.7 [1] (🚔) oncoming police car
|
||||
{0x1F695, 0x1F695, prEmojiPresentation}, // E0.6 [1] (🚕) taxi
|
||||
{0x1F696, 0x1F696, prEmojiPresentation}, // E1.0 [1] (🚖) oncoming taxi
|
||||
{0x1F697, 0x1F697, prEmojiPresentation}, // E0.6 [1] (🚗) automobile
|
||||
{0x1F698, 0x1F698, prEmojiPresentation}, // E0.7 [1] (🚘) oncoming automobile
|
||||
{0x1F699, 0x1F69A, prEmojiPresentation}, // E0.6 [2] (🚙..🚚) sport utility vehicle..delivery truck
|
||||
{0x1F69B, 0x1F6A1, prEmojiPresentation}, // E1.0 [7] (🚛..🚡) articulated lorry..aerial tramway
|
||||
{0x1F6A2, 0x1F6A2, prEmojiPresentation}, // E0.6 [1] (🚢) ship
|
||||
{0x1F6A3, 0x1F6A3, prEmojiPresentation}, // E1.0 [1] (🚣) person rowing boat
|
||||
{0x1F6A4, 0x1F6A5, prEmojiPresentation}, // E0.6 [2] (🚤..🚥) speedboat..horizontal traffic light
|
||||
{0x1F6A6, 0x1F6A6, prEmojiPresentation}, // E1.0 [1] (🚦) vertical traffic light
|
||||
{0x1F6A7, 0x1F6AD, prEmojiPresentation}, // E0.6 [7] (🚧..🚭) construction..no smoking
|
||||
{0x1F6AE, 0x1F6B1, prEmojiPresentation}, // E1.0 [4] (🚮..🚱) litter in bin sign..non-potable water
|
||||
{0x1F6B2, 0x1F6B2, prEmojiPresentation}, // E0.6 [1] (🚲) bicycle
|
||||
{0x1F6B3, 0x1F6B5, prEmojiPresentation}, // E1.0 [3] (🚳..🚵) no bicycles..person mountain biking
|
||||
{0x1F6B6, 0x1F6B6, prEmojiPresentation}, // E0.6 [1] (🚶) person walking
|
||||
{0x1F6B7, 0x1F6B8, prEmojiPresentation}, // E1.0 [2] (🚷..🚸) no pedestrians..children crossing
|
||||
{0x1F6B9, 0x1F6BE, prEmojiPresentation}, // E0.6 [6] (🚹..🚾) men’s room..water closet
|
||||
{0x1F6BF, 0x1F6BF, prEmojiPresentation}, // E1.0 [1] (🚿) shower
|
||||
{0x1F6C0, 0x1F6C0, prEmojiPresentation}, // E0.6 [1] (🛀) person taking bath
|
||||
{0x1F6C1, 0x1F6C5, prEmojiPresentation}, // E1.0 [5] (🛁..🛅) bathtub..left luggage
|
||||
{0x1F6CC, 0x1F6CC, prEmojiPresentation}, // E1.0 [1] (🛌) person in bed
|
||||
{0x1F6D0, 0x1F6D0, prEmojiPresentation}, // E1.0 [1] (🛐) place of worship
|
||||
{0x1F6D1, 0x1F6D2, prEmojiPresentation}, // E3.0 [2] (🛑..🛒) stop sign..shopping cart
|
||||
{0x1F6D5, 0x1F6D5, prEmojiPresentation}, // E12.0 [1] (🛕) hindu temple
|
||||
{0x1F6D6, 0x1F6D7, prEmojiPresentation}, // E13.0 [2] (🛖..🛗) hut..elevator
|
||||
{0x1F6DC, 0x1F6DC, prEmojiPresentation}, // E15.0 [1] (🛜) wireless
|
||||
{0x1F6DD, 0x1F6DF, prEmojiPresentation}, // E14.0 [3] (🛝..🛟) playground slide..ring buoy
|
||||
{0x1F6EB, 0x1F6EC, prEmojiPresentation}, // E1.0 [2] (🛫..🛬) airplane departure..airplane arrival
|
||||
{0x1F6F4, 0x1F6F6, prEmojiPresentation}, // E3.0 [3] (🛴..🛶) kick scooter..canoe
|
||||
{0x1F6F7, 0x1F6F8, prEmojiPresentation}, // E5.0 [2] (🛷..🛸) sled..flying saucer
|
||||
{0x1F6F9, 0x1F6F9, prEmojiPresentation}, // E11.0 [1] (🛹) skateboard
|
||||
{0x1F6FA, 0x1F6FA, prEmojiPresentation}, // E12.0 [1] (🛺) auto rickshaw
|
||||
{0x1F6FB, 0x1F6FC, prEmojiPresentation}, // E13.0 [2] (🛻..🛼) pickup truck..roller skate
|
||||
{0x1F7E0, 0x1F7EB, prEmojiPresentation}, // E12.0 [12] (🟠..🟫) orange circle..brown square
|
||||
{0x1F7F0, 0x1F7F0, prEmojiPresentation}, // E14.0 [1] (🟰) heavy equals sign
|
||||
{0x1F90C, 0x1F90C, prEmojiPresentation}, // E13.0 [1] (🤌) pinched fingers
|
||||
{0x1F90D, 0x1F90F, prEmojiPresentation}, // E12.0 [3] (🤍..🤏) white heart..pinching hand
|
||||
{0x1F910, 0x1F918, prEmojiPresentation}, // E1.0 [9] (🤐..🤘) zipper-mouth face..sign of the horns
|
||||
{0x1F919, 0x1F91E, prEmojiPresentation}, // E3.0 [6] (🤙..🤞) call me hand..crossed fingers
|
||||
{0x1F91F, 0x1F91F, prEmojiPresentation}, // E5.0 [1] (🤟) love-you gesture
|
||||
{0x1F920, 0x1F927, prEmojiPresentation}, // E3.0 [8] (🤠..🤧) cowboy hat face..sneezing face
|
||||
{0x1F928, 0x1F92F, prEmojiPresentation}, // E5.0 [8] (🤨..🤯) face with raised eyebrow..exploding head
|
||||
{0x1F930, 0x1F930, prEmojiPresentation}, // E3.0 [1] (🤰) pregnant woman
|
||||
{0x1F931, 0x1F932, prEmojiPresentation}, // E5.0 [2] (🤱..🤲) breast-feeding..palms up together
|
||||
{0x1F933, 0x1F93A, prEmojiPresentation}, // E3.0 [8] (🤳..🤺) selfie..person fencing
|
||||
{0x1F93C, 0x1F93E, prEmojiPresentation}, // E3.0 [3] (🤼..🤾) people wrestling..person playing handball
|
||||
{0x1F93F, 0x1F93F, prEmojiPresentation}, // E12.0 [1] (🤿) diving mask
|
||||
{0x1F940, 0x1F945, prEmojiPresentation}, // E3.0 [6] (🥀..🥅) wilted flower..goal net
|
||||
{0x1F947, 0x1F94B, prEmojiPresentation}, // E3.0 [5] (🥇..🥋) 1st place medal..martial arts uniform
|
||||
{0x1F94C, 0x1F94C, prEmojiPresentation}, // E5.0 [1] (🥌) curling stone
|
||||
{0x1F94D, 0x1F94F, prEmojiPresentation}, // E11.0 [3] (🥍..🥏) lacrosse..flying disc
|
||||
{0x1F950, 0x1F95E, prEmojiPresentation}, // E3.0 [15] (🥐..🥞) croissant..pancakes
|
||||
{0x1F95F, 0x1F96B, prEmojiPresentation}, // E5.0 [13] (🥟..🥫) dumpling..canned food
|
||||
{0x1F96C, 0x1F970, prEmojiPresentation}, // E11.0 [5] (🥬..🥰) leafy green..smiling face with hearts
|
||||
{0x1F971, 0x1F971, prEmojiPresentation}, // E12.0 [1] (🥱) yawning face
|
||||
{0x1F972, 0x1F972, prEmojiPresentation}, // E13.0 [1] (🥲) smiling face with tear
|
||||
{0x1F973, 0x1F976, prEmojiPresentation}, // E11.0 [4] (🥳..🥶) partying face..cold face
|
||||
{0x1F977, 0x1F978, prEmojiPresentation}, // E13.0 [2] (🥷..🥸) ninja..disguised face
|
||||
{0x1F979, 0x1F979, prEmojiPresentation}, // E14.0 [1] (🥹) face holding back tears
|
||||
{0x1F97A, 0x1F97A, prEmojiPresentation}, // E11.0 [1] (🥺) pleading face
|
||||
{0x1F97B, 0x1F97B, prEmojiPresentation}, // E12.0 [1] (🥻) sari
|
||||
{0x1F97C, 0x1F97F, prEmojiPresentation}, // E11.0 [4] (🥼..🥿) lab coat..flat shoe
|
||||
{0x1F980, 0x1F984, prEmojiPresentation}, // E1.0 [5] (🦀..🦄) crab..unicorn
|
||||
{0x1F985, 0x1F991, prEmojiPresentation}, // E3.0 [13] (🦅..🦑) eagle..squid
|
||||
{0x1F992, 0x1F997, prEmojiPresentation}, // E5.0 [6] (🦒..🦗) giraffe..cricket
|
||||
{0x1F998, 0x1F9A2, prEmojiPresentation}, // E11.0 [11] (🦘..🦢) kangaroo..swan
|
||||
{0x1F9A3, 0x1F9A4, prEmojiPresentation}, // E13.0 [2] (🦣..🦤) mammoth..dodo
|
||||
{0x1F9A5, 0x1F9AA, prEmojiPresentation}, // E12.0 [6] (🦥..🦪) sloth..oyster
|
||||
{0x1F9AB, 0x1F9AD, prEmojiPresentation}, // E13.0 [3] (🦫..🦭) beaver..seal
|
||||
{0x1F9AE, 0x1F9AF, prEmojiPresentation}, // E12.0 [2] (🦮..🦯) guide dog..white cane
|
||||
{0x1F9B0, 0x1F9B9, prEmojiPresentation}, // E11.0 [10] (🦰..🦹) red hair..supervillain
|
||||
{0x1F9BA, 0x1F9BF, prEmojiPresentation}, // E12.0 [6] (🦺..🦿) safety vest..mechanical leg
|
||||
{0x1F9C0, 0x1F9C0, prEmojiPresentation}, // E1.0 [1] (🧀) cheese wedge
|
||||
{0x1F9C1, 0x1F9C2, prEmojiPresentation}, // E11.0 [2] (🧁..🧂) cupcake..salt
|
||||
{0x1F9C3, 0x1F9CA, prEmojiPresentation}, // E12.0 [8] (🧃..🧊) beverage box..ice
|
||||
{0x1F9CB, 0x1F9CB, prEmojiPresentation}, // E13.0 [1] (🧋) bubble tea
|
||||
{0x1F9CC, 0x1F9CC, prEmojiPresentation}, // E14.0 [1] (🧌) troll
|
||||
{0x1F9CD, 0x1F9CF, prEmojiPresentation}, // E12.0 [3] (🧍..🧏) person standing..deaf person
|
||||
{0x1F9D0, 0x1F9E6, prEmojiPresentation}, // E5.0 [23] (🧐..🧦) face with monocle..socks
|
||||
{0x1F9E7, 0x1F9FF, prEmojiPresentation}, // E11.0 [25] (🧧..🧿) red envelope..nazar amulet
|
||||
{0x1FA70, 0x1FA73, prEmojiPresentation}, // E12.0 [4] (🩰..🩳) ballet shoes..shorts
|
||||
{0x1FA74, 0x1FA74, prEmojiPresentation}, // E13.0 [1] (🩴) thong sandal
|
||||
{0x1FA75, 0x1FA77, prEmojiPresentation}, // E15.0 [3] (🩵..🩷) light blue heart..pink heart
|
||||
{0x1FA78, 0x1FA7A, prEmojiPresentation}, // E12.0 [3] (🩸..🩺) drop of blood..stethoscope
|
||||
{0x1FA7B, 0x1FA7C, prEmojiPresentation}, // E14.0 [2] (🩻..🩼) x-ray..crutch
|
||||
{0x1FA80, 0x1FA82, prEmojiPresentation}, // E12.0 [3] (🪀..🪂) yo-yo..parachute
|
||||
{0x1FA83, 0x1FA86, prEmojiPresentation}, // E13.0 [4] (🪃..🪆) boomerang..nesting dolls
|
||||
{0x1FA87, 0x1FA88, prEmojiPresentation}, // E15.0 [2] (🪇..🪈) maracas..flute
|
||||
{0x1FA90, 0x1FA95, prEmojiPresentation}, // E12.0 [6] (🪐..🪕) ringed planet..banjo
|
||||
{0x1FA96, 0x1FAA8, prEmojiPresentation}, // E13.0 [19] (🪖..🪨) military helmet..rock
|
||||
{0x1FAA9, 0x1FAAC, prEmojiPresentation}, // E14.0 [4] (🪩..🪬) mirror ball..hamsa
|
||||
{0x1FAAD, 0x1FAAF, prEmojiPresentation}, // E15.0 [3] (🪭..🪯) folding hand fan..khanda
|
||||
{0x1FAB0, 0x1FAB6, prEmojiPresentation}, // E13.0 [7] (🪰..🪶) fly..feather
|
||||
{0x1FAB7, 0x1FABA, prEmojiPresentation}, // E14.0 [4] (🪷..🪺) lotus..nest with eggs
|
||||
{0x1FABB, 0x1FABD, prEmojiPresentation}, // E15.0 [3] (🪻..🪽) hyacinth..wing
|
||||
{0x1FABF, 0x1FABF, prEmojiPresentation}, // E15.0 [1] (🪿) goose
|
||||
{0x1FAC0, 0x1FAC2, prEmojiPresentation}, // E13.0 [3] (🫀..🫂) anatomical heart..people hugging
|
||||
{0x1FAC3, 0x1FAC5, prEmojiPresentation}, // E14.0 [3] (🫃..🫅) pregnant man..person with crown
|
||||
{0x1FACE, 0x1FACF, prEmojiPresentation}, // E15.0 [2] (🫎..🫏) moose..donkey
|
||||
{0x1FAD0, 0x1FAD6, prEmojiPresentation}, // E13.0 [7] (🫐..🫖) blueberries..teapot
|
||||
{0x1FAD7, 0x1FAD9, prEmojiPresentation}, // E14.0 [3] (🫗..🫙) pouring liquid..jar
|
||||
{0x1FADA, 0x1FADB, prEmojiPresentation}, // E15.0 [2] (🫚..🫛) ginger root..pea pod
|
||||
{0x1FAE0, 0x1FAE7, prEmojiPresentation}, // E14.0 [8] (🫠..🫧) melting face..bubbles
|
||||
{0x1FAE8, 0x1FAE8, prEmojiPresentation}, // E15.0 [1] (🫨) shaking face
|
||||
{0x1FAF0, 0x1FAF6, prEmojiPresentation}, // E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
|
||||
{0x1FAF7, 0x1FAF8, prEmojiPresentation}, // E15.0 [2] (🫷..🫸) leftwards pushing hand..rightwards pushing hand
|
||||
}
|
215
vendor/github.com/rivo/uniseg/gen_breaktest.go
generated
vendored
Normal file
215
vendor/github.com/rivo/uniseg/gen_breaktest.go
generated
vendored
Normal file
@ -0,0 +1,215 @@
|
||||
//go:build generate
|
||||
|
||||
// This program generates a Go file containing a slice of test cases based on the
|
||||
// Unicode Character Database auxiliary data files. The command line arguments
|
||||
// are as follows:
|
||||
//
|
||||
// 1. The name of the Unicode data file (just the filename, without extension).
|
||||
// 2. The name of the locally generated Go file.
|
||||
// 3. The name of the slice containing the test cases.
|
||||
// 4. The name of the generator, for logging purposes.
|
||||
//
|
||||
//go:generate go run gen_breaktest.go GraphemeBreakTest graphemebreak_test.go graphemeBreakTestCases graphemes
|
||||
//go:generate go run gen_breaktest.go WordBreakTest wordbreak_test.go wordBreakTestCases words
|
||||
//go:generate go run gen_breaktest.go SentenceBreakTest sentencebreak_test.go sentenceBreakTestCases sentences
|
||||
//go:generate go run gen_breaktest.go LineBreakTest linebreak_test.go lineBreakTestCases lines
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"go/format"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
// We want to test against a specific version rather than the latest. When the
|
||||
// package is upgraded to a new version, change these to generate new tests.
|
||||
const (
|
||||
testCaseURL = `https://www.unicode.org/Public/15.0.0/ucd/auxiliary/%s.txt`
|
||||
)
|
||||
|
||||
func main() {
|
||||
if len(os.Args) < 5 {
|
||||
fmt.Println("Not enough arguments, see code for details")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
log.SetPrefix("gen_breaktest (" + os.Args[4] + "): ")
|
||||
log.SetFlags(0)
|
||||
|
||||
// Read text of testcases and parse into Go source code.
|
||||
src, err := parse(fmt.Sprintf(testCaseURL, os.Args[1]))
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// Format the Go code.
|
||||
formatted, err := format.Source(src)
|
||||
if err != nil {
|
||||
log.Fatalln("gofmt:", err)
|
||||
}
|
||||
|
||||
// Write it out.
|
||||
log.Print("Writing to ", os.Args[2])
|
||||
if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// parse downloads the break test file found at url and converts its test cases
// into Go source code. The generated source declares a slice of testCase whose
// name is taken from os.Args[3]; it is not formatted yet.
//
// Empty lines and lines starting with '#' are skipped. For all other lines,
// everything after the first '#' is carried over as a trailing comment and the
// remainder is parsed by parseRuneSequence. The last line of the file must be
// "# EOF", which guards against truncated downloads.
//
// An error is returned if the download fails, if the server does not answer
// with "200 OK", if a line cannot be parsed, or if the "# EOF" marker is
// missing.
func parse(url string) ([]byte, error) {
	log.Printf("Parsing %s", url)
	res, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	body := res.Body
	defer body.Close()

	// http.Get only reports transport-level failures. Without this check, an
	// error page (e.g. 404) would be fed to the parser as if it were data.
	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected HTTP status %q for %s", res.Status, url)
	}

	buf := new(bytes.Buffer)
	buf.Grow(120 << 10)
	buf.WriteString(`// Code generated via go generate from gen_breaktest.go. DO NOT EDIT.

package uniseg

// ` + os.Args[3] + ` are Grapheme testcases taken from
// ` + url + `
// on ` + time.Now().Format("January 2, 2006") + `. See
// https://www.unicode.org/license.html for the Unicode license agreement.
var ` + os.Args[3] + ` = []testCase {
`)

	sc := bufio.NewScanner(body)
	num := 0 // 1-based number of the line currently being processed.
	var line []byte
	// Scratch buffers, reused for every test case.
	original := make([]byte, 0, 64)
	expected := make([]byte, 0, 64)
	for sc.Scan() {
		num++
		line = sc.Bytes()
		if len(line) == 0 || line[0] == '#' {
			continue
		}
		var comment []byte
		if i := bytes.IndexByte(line, '#'); i >= 0 {
			comment = bytes.TrimSpace(line[i+1:])
			line = bytes.TrimSpace(line[:i])
		}
		// Assign (rather than redeclare) so that the scratch buffers keep
		// their grown capacity across iterations.
		original, expected, err = parseRuneSequence(line, original[:0], expected[:0])
		if err != nil {
			return nil, fmt.Errorf(`line %d: %v: %q`, num, err, line)
		}
		fmt.Fprintf(buf, "\t{original: \"%s\", expected: %s}, // %s\n", original, expected, comment)
	}
	if err := sc.Err(); err != nil {
		return nil, err
	}

	// Check for final "# EOF", useful check if we're streaming via HTTP
	if !bytes.Equal(line, []byte("# EOF")) {
		return nil, fmt.Errorf(`line %d: exected "# EOF" as final line, got %q`, num, line)
	}
	buf.WriteString("}\n")
	return buf.Bytes(), nil
}
|
||||
|
||||
// Used by parseRuneSequence to match input via bytes.HasPrefix.
var (
	prefixBreak     = []byte("÷ ")
	prefixDontBreak = []byte("× ")
	breakOk         = []byte("÷")
	breakNo         = []byte("×")
)

// parseRuneSequence parses a rune + breaking opportunity sequence from b
// and appends the Go code for testcase.original to orig
// and appends the Go code for testcase.expected to exp.
// It returns the new orig and exp slices.
//
// E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷"
// it will append
//
//	"\u0020\u0308\U0001F1E6"
//
// and "[][]rune{{0x0020,0x0308},{0x1F1E6},}"
// to orig and exp respectively.
//
// The formatting of exp is expected to be cleaned up by gofmt or format.Source.
// The sequence must start with "÷ " or "× " (the leading marker itself is
// discarded) and every code point must be followed by a space and ÷ or ×. The
// sequence is implicitly required to end with ÷: a trailing × leaves the last
// group unclosed, which only surfaces when the generated code is formatted.
//
// Code points must consist of exactly four or five hexadecimal digits. On any
// malformed input, nil slices and a non-nil error are returned.
func parseRuneSequence(b, orig, exp []byte) ([]byte, []byte, error) {
	// Check for and remove first ÷ or ×.
	switch {
	case bytes.HasPrefix(b, prefixBreak):
		b = b[len(prefixBreak):]
	case bytes.HasPrefix(b, prefixDontBreak):
		b = b[len(prefixDontBreak):]
	default:
		return nil, nil, errors.New("expected ÷ or × as first character")
	}

	boundary := true
	exp = append(exp, "[][]rune{"...)
	for len(b) > 0 {
		if boundary {
			exp = append(exp, '{')
		}
		exp = append(exp, "0x"...)
		// Find end of hex digits.
		var i int
		for i = 0; i < len(b) && b[i] != ' '; i++ {
			// Both bounds of each range must hold. (Joining them with "||"
			// accepts every byte and lets invalid input through.)
			if d := b[i]; ('0' <= d && d <= '9') ||
				('A' <= d && d <= 'F') ||
				('a' <= d && d <= 'f') {
				continue
			}
			return nil, nil, errors.New("bad hex digit")
		}
		// Pick the escape sequence matching the number of hex digits.
		switch i {
		case 4:
			orig = append(orig, "\\u"...)
		case 5:
			orig = append(orig, "\\U000"...)
		default:
			return nil, nil, errors.New("unsupport code point hex length")
		}
		orig = append(orig, b[:i]...)
		exp = append(exp, b[:i]...)
		b = b[i:]

		// Check for space between hex and ÷ or ×.
		if len(b) < 1 || b[0] != ' ' {
			return nil, nil, errors.New("bad input")
		}
		b = b[1:]

		// Check for next boundary.
		switch {
		case bytes.HasPrefix(b, breakOk):
			boundary = true
			b = b[len(breakOk):]
		case bytes.HasPrefix(b, breakNo):
			boundary = false
			b = b[len(breakNo):]
		default:
			return nil, nil, errors.New("missing ÷ or ×")
		}
		if boundary {
			exp = append(exp, '}')
		}
		exp = append(exp, ',')
		// Skip the space separating the marker from the next code point.
		if len(b) > 0 && b[0] == ' ' {
			b = b[1:]
		}
	}
	exp = append(exp, '}')
	return orig, exp, nil
}
|
261
vendor/github.com/rivo/uniseg/gen_properties.go
generated
vendored
Normal file
261
vendor/github.com/rivo/uniseg/gen_properties.go
generated
vendored
Normal file
@ -0,0 +1,261 @@
|
||||
//go:build generate
|
||||
|
||||
// This program generates a Go property file from Unicode Character
|
||||
// Database auxiliary data files. The command line arguments are as follows:
|
||||
//
|
||||
// 1. The name of the Unicode data file (just the filename, without extension).
|
||||
// Can be "-" (to skip) if the emoji flag is included.
|
||||
// 2. The name of the locally generated Go file.
|
||||
// 3. The name of the slice mapping code points to properties.
|
||||
// 4. The name of the generator, for logging purposes.
|
||||
// 5. (Optional) Flags, comma-separated. The following flags are available:
|
||||
// - "emojis=<property>": include the specified emoji properties (e.g.
|
||||
// "Extended_Pictographic").
|
||||
// - "gencat": include general category properties.
|
||||
//
|
||||
//go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis=Extended_Pictographic
|
||||
//go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis=Extended_Pictographic
|
||||
//go:generate go run gen_properties.go auxiliary/SentenceBreakProperty sentenceproperties.go sentenceBreakCodePoints sentences
|
||||
//go:generate go run gen_properties.go LineBreak lineproperties.go lineBreakCodePoints lines gencat
|
||||
//go:generate go run gen_properties.go EastAsianWidth eastasianwidth.go eastAsianWidth eastasianwidth
|
||||
//go:generate go run gen_properties.go - emojipresentation.go emojiPresentation emojipresentation emojis=Emoji_Presentation
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"go/format"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// The Unicode version is pinned here instead of following the latest release.
// When the package is upgraded to a new Unicode version, change both URLs and
// regenerate the property files.

// propertyURL is the format string for a Unicode Character Database data file;
// the single %s verb receives the file name without its ".txt" extension.
const propertyURL = `https://www.unicode.org/Public/15.0.0/ucd/%s.txt`

// emojiURL is the location of the emoji data file.
const emojiURL = `https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt`
|
||||
|
||||
// propertyPattern matches one data line of a Unicode property file. Its
// submatches are: 1 = first code point, 3 = last code point (empty when the
// line describes a single code point), 4 = property name, 5 = the text of the
// trailing "#" comment.
var propertyPattern = regexp.MustCompile(
	`^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?\s*;\s*([A-Za-z0-9_]+)\s*#\s(.+)$`,
)
|
||||
|
||||
func main() {
|
||||
if len(os.Args) < 5 {
|
||||
fmt.Println("Not enough arguments, see code for details")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
log.SetPrefix("gen_properties (" + os.Args[4] + "): ")
|
||||
log.SetFlags(0)
|
||||
|
||||
// Parse flags.
|
||||
flags := make(map[string]string)
|
||||
if len(os.Args) >= 6 {
|
||||
for _, flag := range strings.Split(os.Args[5], ",") {
|
||||
flagFields := strings.Split(flag, "=")
|
||||
if len(flagFields) == 1 {
|
||||
flags[flagFields[0]] = "yes"
|
||||
} else {
|
||||
flags[flagFields[0]] = flagFields[1]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Parse the text file and generate Go source code from it.
|
||||
_, includeGeneralCategory := flags["gencat"]
|
||||
var mainURL string
|
||||
if os.Args[1] != "-" {
|
||||
mainURL = fmt.Sprintf(propertyURL, os.Args[1])
|
||||
}
|
||||
src, err := parse(mainURL, flags["emojis"], includeGeneralCategory)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// Format the Go code.
|
||||
formatted, err := format.Source([]byte(src))
|
||||
if err != nil {
|
||||
log.Fatal("gofmt:", err)
|
||||
}
|
||||
|
||||
// Save it to the (local) target file.
|
||||
log.Print("Writing to ", os.Args[2])
|
||||
if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// parse parses the Unicode Properties text files located at the given URLs and
|
||||
// returns their equivalent Go source code to be used in the uniseg package. If
|
||||
// "emojiProperty" is not an empty string, emoji code points for that emoji
|
||||
// property (e.g. "Extended_Pictographic") will be included. In those cases, you
|
||||
// may pass an empty "propertyURL" to skip parsing the main properties file. If
|
||||
// "includeGeneralCategory" is true, the Unicode General Category property will
|
||||
// be extracted from the comments and included in the output.
|
||||
func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (string, error) {
|
||||
if propertyURL == "" && emojiProperty == "" {
|
||||
return "", errors.New("no properties to parse")
|
||||
}
|
||||
|
||||
// Temporary buffer to hold properties.
|
||||
var properties [][4]string
|
||||
|
||||
// Open the first URL.
|
||||
if propertyURL != "" {
|
||||
log.Printf("Parsing %s", propertyURL)
|
||||
res, err := http.Get(propertyURL)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
in1 := res.Body
|
||||
defer in1.Close()
|
||||
|
||||
// Parse it.
|
||||
scanner := bufio.NewScanner(in1)
|
||||
num := 0
|
||||
for scanner.Scan() {
|
||||
num++
|
||||
line := strings.TrimSpace(scanner.Text())
|
||||
|
||||
// Skip comments and empty lines.
|
||||
if strings.HasPrefix(line, "#") || line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
// Everything else must be a code point range, a property and a comment.
|
||||
from, to, property, comment, err := parseProperty(line)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("%s line %d: %v", os.Args[4], num, err)
|
||||
}
|
||||
properties = append(properties, [4]string{from, to, property, comment})
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
|
||||
// Open the second URL.
|
||||
if emojiProperty != "" {
|
||||
log.Printf("Parsing %s", emojiURL)
|
||||
res, err := http.Get(emojiURL)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
in2 := res.Body
|
||||
defer in2.Close()
|
||||
|
||||
// Parse it.
|
||||
scanner := bufio.NewScanner(in2)
|
||||
num := 0
|
||||
for scanner.Scan() {
|
||||
num++
|
||||
line := scanner.Text()
|
||||
|
||||
// Skip comments, empty lines, and everything not containing
|
||||
// "Extended_Pictographic".
|
||||
if strings.HasPrefix(line, "#") || line == "" || !strings.Contains(line, emojiProperty) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Everything else must be a code point range, a property and a comment.
|
||||
from, to, property, comment, err := parseProperty(line)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("emojis line %d: %v", num, err)
|
||||
}
|
||||
properties = append(properties, [4]string{from, to, property, comment})
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
|
||||
// Avoid overflow during binary search.
|
||||
if len(properties) >= 1<<31 {
|
||||
return "", errors.New("too many properties")
|
||||
}
|
||||
|
||||
// Sort properties.
|
||||
sort.Slice(properties, func(i, j int) bool {
|
||||
left, _ := strconv.ParseUint(properties[i][0], 16, 64)
|
||||
right, _ := strconv.ParseUint(properties[j][0], 16, 64)
|
||||
return left < right
|
||||
})
|
||||
|
||||
// Header.
|
||||
var (
|
||||
buf bytes.Buffer
|
||||
emojiComment string
|
||||
)
|
||||
columns := 3
|
||||
if includeGeneralCategory {
|
||||
columns = 4
|
||||
}
|
||||
if emojiURL != "" {
|
||||
emojiComment = `
|
||||
// and
|
||||
// ` + emojiURL + `
|
||||
// ("Extended_Pictographic" only)`
|
||||
}
|
||||
buf.WriteString(`// Code generated via go generate from gen_properties.go. DO NOT EDIT.
|
||||
|
||||
package uniseg
|
||||
|
||||
// ` + os.Args[3] + ` are taken from
|
||||
// ` + propertyURL + emojiComment + `
|
||||
// on ` + time.Now().Format("January 2, 2006") + `. See https://www.unicode.org/license.html for the Unicode
|
||||
// license agreement.
|
||||
var ` + os.Args[3] + ` = [][` + strconv.Itoa(columns) + `]int{
|
||||
`)
|
||||
|
||||
// Properties.
|
||||
for _, prop := range properties {
|
||||
if includeGeneralCategory {
|
||||
generalCategory := "gc" + prop[3][:2]
|
||||
if generalCategory == "gcL&" {
|
||||
generalCategory = "gcLC"
|
||||
}
|
||||
prop[3] = prop[3][3:]
|
||||
fmt.Fprintf(&buf, "{0x%s,0x%s,%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), generalCategory, prop[3])
|
||||
} else {
|
||||
fmt.Fprintf(&buf, "{0x%s,0x%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), prop[3])
|
||||
}
|
||||
}
|
||||
|
||||
// Tail.
|
||||
buf.WriteString("}")
|
||||
|
||||
return buf.String(), nil
|
||||
}
|
||||
|
||||
// parseProperty parses a line of the Unicode properties text file containing a
|
||||
// property for a code point range and returns it along with its comment.
|
||||
func parseProperty(line string) (from, to, property, comment string, err error) {
|
||||
fields := propertyPattern.FindStringSubmatch(line)
|
||||
if fields == nil {
|
||||
err = errors.New("no property found")
|
||||
return
|
||||
}
|
||||
from = fields[1]
|
||||
to = fields[3]
|
||||
if to == "" {
|
||||
to = from
|
||||
}
|
||||
property = fields[4]
|
||||
comment = fields[5]
|
||||
return
|
||||
}
|
||||
|
||||
// translateProperty turns a property name as spelled in the Unicode data file
// into the identifier used in the Go code: all underscores are removed and
// "prefix" is put in front.
func translateProperty(prefix, property string) string {
	name := strings.Replace(property, "_", "", -1)
	return prefix + name
}
|
331
vendor/github.com/rivo/uniseg/grapheme.go
generated
vendored
Normal file
331
vendor/github.com/rivo/uniseg/grapheme.go
generated
vendored
Normal file
@ -0,0 +1,331 @@
|
||||
package uniseg
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// Graphemes implements an iterator over Unicode grapheme clusters, or
|
||||
// user-perceived characters. While iterating, it also provides information
|
||||
// about word boundaries, sentence boundaries, line breaks, and monospace
|
||||
// character widths.
|
||||
//
|
||||
// After constructing the class via [NewGraphemes] for a given string "str",
|
||||
// [Graphemes.Next] is called for every grapheme cluster in a loop until it
|
||||
// returns false. Inside the loop, information about the grapheme cluster as
|
||||
// well as boundary information and character width is available via the various
|
||||
// methods (see examples below).
|
||||
//
|
||||
// This class basically wraps the [StepString] parser and provides a convenient
|
||||
// interface to it. If you are only interested in some parts of this package's
|
||||
// functionality, using the specialized functions starting with "First" is
|
||||
// almost always faster.
|
||||
type Graphemes struct {
	// The complete input string; Reset restarts the iteration from it.
	original string

	// The part of the input that has not been handed out as a cluster yet.
	remaining string

	// The grapheme cluster the iterator currently points to. Empty before the
	// first call to Next and after the end has been reached.
	cluster string

	// Byte index of the current grapheme cluster within the original string.
	offset int

	// Boundary information for the current cluster as returned by the
	// [StepString] parser.
	boundaries int

	// Parser state as returned by [StepString], or -1 if Next has not been
	// called yet, or -2 if the iterator is past the end.
	state int
}
|
||||
|
||||
// NewGraphemes returns a new grapheme cluster iterator.
|
||||
func NewGraphemes(str string) *Graphemes {
|
||||
return &Graphemes{
|
||||
original: str,
|
||||
remaining: str,
|
||||
state: -1,
|
||||
}
|
||||
}
|
||||
|
||||
// Next advances the iterator by one grapheme cluster and returns false if no
|
||||
// clusters are left. This function must be called before the first cluster is
|
||||
// accessed.
|
||||
func (g *Graphemes) Next() bool {
|
||||
if len(g.remaining) == 0 {
|
||||
// We're already past the end.
|
||||
g.state = -2
|
||||
g.cluster = ""
|
||||
return false
|
||||
}
|
||||
g.offset += len(g.cluster)
|
||||
g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
|
||||
return true
|
||||
}
|
||||
|
||||
// Runes returns a slice of runes (code points) which corresponds to the current
|
||||
// grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
|
||||
// has not yet been called, nil is returned.
|
||||
func (g *Graphemes) Runes() []rune {
|
||||
if g.state < 0 {
|
||||
return nil
|
||||
}
|
||||
return []rune(g.cluster)
|
||||
}
|
||||
|
||||
// Str returns a substring of the original string which corresponds to the
|
||||
// current grapheme cluster. If the iterator is already past the end or
|
||||
// [Graphemes.Next] has not yet been called, an empty string is returned.
|
||||
func (g *Graphemes) Str() string {
|
||||
return g.cluster
|
||||
}
|
||||
|
||||
// Bytes returns a byte slice which corresponds to the current grapheme cluster.
|
||||
// If the iterator is already past the end or [Graphemes.Next] has not yet been
|
||||
// called, nil is returned.
|
||||
func (g *Graphemes) Bytes() []byte {
|
||||
if g.state < 0 {
|
||||
return nil
|
||||
}
|
||||
return []byte(g.cluster)
|
||||
}
|
||||
|
||||
// Positions returns the interval of the current grapheme cluster as byte
|
||||
// positions into the original string. The first returned value "from" indexes
|
||||
// the first byte and the second returned value "to" indexes the first byte that
|
||||
// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
|
||||
// the original string "str". If [Graphemes.Next] has not yet been called, both
|
||||
// values are 0. If the iterator is already past the end, both values are 1.
|
||||
func (g *Graphemes) Positions() (int, int) {
|
||||
if g.state == -1 {
|
||||
return 0, 0
|
||||
} else if g.state == -2 {
|
||||
return 1, 1
|
||||
}
|
||||
return g.offset, g.offset + len(g.cluster)
|
||||
}
|
||||
|
||||
// IsWordBoundary returns true if a word ends after the current grapheme
|
||||
// cluster.
|
||||
func (g *Graphemes) IsWordBoundary() bool {
|
||||
if g.state < 0 {
|
||||
return true
|
||||
}
|
||||
return g.boundaries&MaskWord != 0
|
||||
}
|
||||
|
||||
// IsSentenceBoundary returns true if a sentence ends after the current
|
||||
// grapheme cluster.
|
||||
func (g *Graphemes) IsSentenceBoundary() bool {
|
||||
if g.state < 0 {
|
||||
return true
|
||||
}
|
||||
return g.boundaries&MaskSentence != 0
|
||||
}
|
||||
|
||||
// LineBreak returns whether the line can be broken after the current grapheme
|
||||
// cluster. A value of [LineDontBreak] means the line may not be broken, a value
|
||||
// of [LineMustBreak] means the line must be broken, and a value of
|
||||
// [LineCanBreak] means the line may or may not be broken.
|
||||
func (g *Graphemes) LineBreak() int {
|
||||
if g.state == -1 {
|
||||
return LineDontBreak
|
||||
}
|
||||
if g.state == -2 {
|
||||
return LineMustBreak
|
||||
}
|
||||
return g.boundaries & MaskLine
|
||||
}
|
||||
|
||||
// Width returns the monospace width of the current grapheme cluster.
|
||||
func (g *Graphemes) Width() int {
|
||||
if g.state < 0 {
|
||||
return 0
|
||||
}
|
||||
return g.boundaries >> ShiftWidth
|
||||
}
|
||||
|
||||
// Reset puts the iterator into its initial state such that the next call to
|
||||
// [Graphemes.Next] sets it to the first grapheme cluster again.
|
||||
func (g *Graphemes) Reset() {
|
||||
g.state = -1
|
||||
g.offset = 0
|
||||
g.cluster = ""
|
||||
g.remaining = g.original
|
||||
}
|
||||
|
||||
// GraphemeClusterCount returns the number of user-perceived characters
|
||||
// (grapheme clusters) for the given string.
|
||||
func GraphemeClusterCount(s string) (n int) {
|
||||
state := -1
|
||||
for len(s) > 0 {
|
||||
_, s, _, state = FirstGraphemeClusterInString(s, state)
|
||||
n++
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// ReverseString reverses the given string while observing grapheme cluster
|
||||
// boundaries.
|
||||
func ReverseString(s string) string {
|
||||
str := []byte(s)
|
||||
reversed := make([]byte, len(str))
|
||||
state := -1
|
||||
index := len(str)
|
||||
for len(str) > 0 {
|
||||
var cluster []byte
|
||||
cluster, str, _, state = FirstGraphemeCluster(str, state)
|
||||
index -= len(cluster)
|
||||
copy(reversed[index:], cluster)
|
||||
if index <= len(str)/2 {
|
||||
break
|
||||
}
|
||||
}
|
||||
return string(reversed)
|
||||
}
|
||||
|
||||
// shiftGraphemePropState is the number of bits by which a grapheme property is
// shifted to the left when it is packed into a state value, leaving the lower
// bits to the grapheme parser state.
const shiftGraphemePropState = 4
|
||||
|
||||
// FirstGraphemeCluster returns the first grapheme cluster found in the given
|
||||
// byte slice according to the rules of [Unicode Standard Annex #29, Grapheme
|
||||
// Cluster Boundaries]. This function can be called continuously to extract all
|
||||
// grapheme clusters from a byte slice, as illustrated in the example below.
|
||||
//
|
||||
// If you don't know the current state, for example when calling the function
|
||||
// for the first time, you must pass -1. For consecutive calls, pass the state
|
||||
// and rest slice returned by the previous call.
|
||||
//
|
||||
// The "rest" slice is the sub-slice of the original byte slice "b" starting
|
||||
// after the last byte of the identified grapheme cluster. If the length of the
|
||||
// "rest" slice is 0, the entire byte slice "b" has been processed. The
|
||||
// "cluster" byte slice is the sub-slice of the input slice containing the
|
||||
// identified grapheme cluster.
|
||||
//
|
||||
// The returned width is the width of the grapheme cluster for most monospace
|
||||
// fonts where a value of 1 represents one character cell.
|
||||
//
|
||||
// Given an empty byte slice "b", the function returns nil values.
|
||||
//
|
||||
// While slightly less convenient than using the Graphemes class, this function
|
||||
// has much better performance and makes no allocations. It lends itself well to
|
||||
// large byte slices.
|
||||
//
|
||||
// [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
|
||||
func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) {
|
||||
// An empty byte slice returns nothing.
|
||||
if len(b) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Extract the first rune.
|
||||
r, length := utf8.DecodeRune(b)
|
||||
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
|
||||
var prop int
|
||||
if state < 0 {
|
||||
prop = propertyGraphemes(r)
|
||||
} else {
|
||||
prop = state >> shiftGraphemePropState
|
||||
}
|
||||
return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
|
||||
}
|
||||
|
||||
// If we don't know the state, determine it now.
|
||||
var firstProp int
|
||||
if state < 0 {
|
||||
state, firstProp, _ = transitionGraphemeState(state, r)
|
||||
} else {
|
||||
firstProp = state >> shiftGraphemePropState
|
||||
}
|
||||
width += runeWidth(r, firstProp)
|
||||
|
||||
// Transition until we find a boundary.
|
||||
for {
|
||||
var (
|
||||
prop int
|
||||
boundary bool
|
||||
)
|
||||
|
||||
r, l := utf8.DecodeRune(b[length:])
|
||||
state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
|
||||
|
||||
if boundary {
|
||||
return b[:length], b[length:], width, state | (prop << shiftGraphemePropState)
|
||||
}
|
||||
|
||||
if firstProp == prExtendedPictographic {
|
||||
if r == vs15 {
|
||||
width = 1
|
||||
} else if r == vs16 {
|
||||
width = 2
|
||||
}
|
||||
} else if firstProp != prRegionalIndicator && firstProp != prL {
|
||||
width += runeWidth(r, prop)
|
||||
}
|
||||
|
||||
length += l
|
||||
if len(b) <= length {
|
||||
return b, nil, width, grAny | (prop << shiftGraphemePropState)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
|
||||
// outputs are strings.
|
||||
func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
|
||||
// An empty string returns nothing.
|
||||
if len(str) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Extract the first rune.
|
||||
r, length := utf8.DecodeRuneInString(str)
|
||||
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
|
||||
var prop int
|
||||
if state < 0 {
|
||||
prop = propertyGraphemes(r)
|
||||
} else {
|
||||
prop = state >> shiftGraphemePropState
|
||||
}
|
||||
return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
|
||||
}
|
||||
|
||||
// If we don't know the state, determine it now.
|
||||
var firstProp int
|
||||
if state < 0 {
|
||||
state, firstProp, _ = transitionGraphemeState(state, r)
|
||||
} else {
|
||||
firstProp = state >> shiftGraphemePropState
|
||||
}
|
||||
width += runeWidth(r, firstProp)
|
||||
|
||||
// Transition until we find a boundary.
|
||||
for {
|
||||
var (
|
||||
prop int
|
||||
boundary bool
|
||||
)
|
||||
|
||||
r, l := utf8.DecodeRuneInString(str[length:])
|
||||
state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
|
||||
|
||||
if boundary {
|
||||
return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
|
||||
}
|
||||
|
||||
if firstProp == prExtendedPictographic {
|
||||
if r == vs15 {
|
||||
width = 1
|
||||
} else if r == vs16 {
|
||||
width = 2
|
||||
}
|
||||
} else if firstProp != prRegionalIndicator && firstProp != prL {
|
||||
width += runeWidth(r, prop)
|
||||
}
|
||||
|
||||
length += l
|
||||
if len(str) <= length {
|
||||
return str, "", width, grAny | (prop << shiftGraphemePropState)
|
||||
}
|
||||
}
|
||||
}
|
1915
vendor/github.com/rivo/uniseg/graphemeproperties.go
generated
vendored
Normal file
1915
vendor/github.com/rivo/uniseg/graphemeproperties.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
176
vendor/github.com/rivo/uniseg/graphemerules.go
generated
vendored
Normal file
176
vendor/github.com/rivo/uniseg/graphemerules.go
generated
vendored
Normal file
@ -0,0 +1,176 @@
|
||||
package uniseg
|
||||
|
||||
// States of the grapheme cluster parser. The comments name the properties
// after which grTransitions enters each state.
const (
	grAny                     = iota // Default state.
	grCR                             // After CR.
	grControlLF                      // After LF or Control, or after CR LF.
	grL                              // After L.
	grLVV                            // After LV or V.
	grLVTT                           // After LVT or T.
	grPrepend                        // After Prepend.
	grExtendedPictographic           // After Extended_Pictographic, possibly followed by Extend.
	grExtendedPictographicZWJ        // After Extended_Pictographic followed by ZWJ.
	grRIOdd                          // After an odd number of Regional_Indicator code points.
	grRIEven                         // After an even number of Regional_Indicator code points.
)
|
||||
|
||||
// Breaking instructions returned by the grapheme cluster parser's transitions.
const (
	grNoBoundary = iota // Do not break between the last and the next code point.
	grBoundary          // Break between the last and the next code point.
)
|
||||
|
||||
// grTransitions implements the grapheme cluster parser's state transitions.
|
||||
// Maps state and property to a new state, a breaking instruction, and rule
|
||||
// number. The breaking instruction always refers to the boundary between the
|
||||
// last and next code point. Returns negative values if no transition is found.
|
||||
//
|
||||
// This function is used as follows:
|
||||
//
|
||||
// 1. Find specific state + specific property. Stop if found.
|
||||
// 2. Find specific state + any property.
|
||||
// 3. Find any state + specific property.
|
||||
// 4. If only (2) or (3) (but not both) was found, stop.
|
||||
// 5. If both (2) and (3) were found, use state from (3) and breaking instruction
|
||||
// from the transition with the lower rule number, prefer (3) if rule numbers
|
||||
// are equal. Stop.
|
||||
// 6. Assume grAny and grBoundary.
|
||||
//
|
||||
// Unicode version 15.0.0.
|
||||
func grTransitions(state, prop int) (newState int, newProp int, boundary int) {
|
||||
// It turns out that using a big switch statement is much faster than using
|
||||
// a map.
|
||||
|
||||
switch uint64(state) | uint64(prop)<<32 {
|
||||
// GB5
|
||||
case grAny | prCR<<32:
|
||||
return grCR, grBoundary, 50
|
||||
case grAny | prLF<<32:
|
||||
return grControlLF, grBoundary, 50
|
||||
case grAny | prControl<<32:
|
||||
return grControlLF, grBoundary, 50
|
||||
|
||||
// GB4
|
||||
case grCR | prAny<<32:
|
||||
return grAny, grBoundary, 40
|
||||
case grControlLF | prAny<<32:
|
||||
return grAny, grBoundary, 40
|
||||
|
||||
// GB3
|
||||
case grCR | prLF<<32:
|
||||
return grControlLF, grNoBoundary, 30
|
||||
|
||||
// GB6
|
||||
case grAny | prL<<32:
|
||||
return grL, grBoundary, 9990
|
||||
case grL | prL<<32:
|
||||
return grL, grNoBoundary, 60
|
||||
case grL | prV<<32:
|
||||
return grLVV, grNoBoundary, 60
|
||||
case grL | prLV<<32:
|
||||
return grLVV, grNoBoundary, 60
|
||||
case grL | prLVT<<32:
|
||||
return grLVTT, grNoBoundary, 60
|
||||
|
||||
// GB7
|
||||
case grAny | prLV<<32:
|
||||
return grLVV, grBoundary, 9990
|
||||
case grAny | prV<<32:
|
||||
return grLVV, grBoundary, 9990
|
||||
case grLVV | prV<<32:
|
||||
return grLVV, grNoBoundary, 70
|
||||
case grLVV | prT<<32:
|
||||
return grLVTT, grNoBoundary, 70
|
||||
|
||||
// GB8
|
||||
case grAny | prLVT<<32:
|
||||
return grLVTT, grBoundary, 9990
|
||||
case grAny | prT<<32:
|
||||
return grLVTT, grBoundary, 9990
|
||||
case grLVTT | prT<<32:
|
||||
return grLVTT, grNoBoundary, 80
|
||||
|
||||
// GB9
|
||||
case grAny | prExtend<<32:
|
||||
return grAny, grNoBoundary, 90
|
||||
case grAny | prZWJ<<32:
|
||||
return grAny, grNoBoundary, 90
|
||||
|
||||
// GB9a
|
||||
case grAny | prSpacingMark<<32:
|
||||
return grAny, grNoBoundary, 91
|
||||
|
||||
// GB9b
|
||||
case grAny | prPrepend<<32:
|
||||
return grPrepend, grBoundary, 9990
|
||||
case grPrepend | prAny<<32:
|
||||
return grAny, grNoBoundary, 92
|
||||
|
||||
// GB11
|
||||
case grAny | prExtendedPictographic<<32:
|
||||
return grExtendedPictographic, grBoundary, 9990
|
||||
case grExtendedPictographic | prExtend<<32:
|
||||
return grExtendedPictographic, grNoBoundary, 110
|
||||
case grExtendedPictographic | prZWJ<<32:
|
||||
return grExtendedPictographicZWJ, grNoBoundary, 110
|
||||
case grExtendedPictographicZWJ | prExtendedPictographic<<32:
|
||||
return grExtendedPictographic, grNoBoundary, 110
|
||||
|
||||
// GB12 / GB13
|
||||
case grAny | prRegionalIndicator<<32:
|
||||
return grRIOdd, grBoundary, 9990
|
||||
case grRIOdd | prRegionalIndicator<<32:
|
||||
return grRIEven, grNoBoundary, 120
|
||||
case grRIEven | prRegionalIndicator<<32:
|
||||
return grRIOdd, grBoundary, 120
|
||||
default:
|
||||
return -1, -1, -1
|
||||
}
|
||||
}
|
||||
|
||||
// transitionGraphemeState determines the new state of the grapheme cluster
|
||||
// parser given the current state and the next code point. It also returns the
|
||||
// code point's grapheme property (the value mapped by the [graphemeCodePoints]
|
||||
// table) and whether a cluster boundary was detected.
|
||||
func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
|
||||
// Determine the property of the next character.
|
||||
prop = propertyGraphemes(r)
|
||||
|
||||
// Find the applicable transition.
|
||||
nextState, nextProp, _ := grTransitions(state, prop)
|
||||
if nextState >= 0 {
|
||||
// We have a specific transition. We'll use it.
|
||||
return nextState, prop, nextProp == grBoundary
|
||||
}
|
||||
|
||||
// No specific transition found. Try the less specific ones.
|
||||
anyPropState, anyPropProp, anyPropRule := grTransitions(state, prAny)
|
||||
anyStateState, anyStateProp, anyStateRule := grTransitions(grAny, prop)
|
||||
if anyPropState >= 0 && anyStateState >= 0 {
|
||||
// Both apply. We'll use a mix (see comments for grTransitions).
|
||||
newState = anyStateState
|
||||
boundary = anyStateProp == grBoundary
|
||||
if anyPropRule < anyStateRule {
|
||||
boundary = anyPropProp == grBoundary
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if anyPropState >= 0 {
|
||||
// We only have a specific state.
|
||||
return anyPropState, prop, anyPropProp == grBoundary
|
||||
// This branch will probably never be reached because okAnyState will
|
||||
// always be true given the current transition map. But we keep it here
|
||||
// for future modifications to the transition map where this may not be
|
||||
// true anymore.
|
||||
}
|
||||
|
||||
if anyStateState >= 0 {
|
||||
// We only have a specific property.
|
||||
return anyStateState, prop, anyStateProp == grBoundary
|
||||
}
|
||||
|
||||
// No known transition. GB999: Any ÷ Any.
|
||||
return grAny, prop, true
|
||||
}
|
134
vendor/github.com/rivo/uniseg/line.go
generated
vendored
Normal file
134
vendor/github.com/rivo/uniseg/line.go
generated
vendored
Normal file
@ -0,0 +1,134 @@
|
||||
package uniseg
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// FirstLineSegment returns the prefix of the given byte slice after which a
|
||||
// decision to break the string over to the next line can or must be made,
|
||||
// according to the rules of [Unicode Standard Annex #14]. This is used to
|
||||
// implement line breaking.
|
||||
//
|
||||
// Line breaking, also known as word wrapping, is the process of breaking a
|
||||
// section of text into lines such that it will fit in the available width of a
|
||||
// page, window or other display area.
|
||||
//
|
||||
// The returned "segment" may not be broken into smaller parts, unless no other
|
||||
// breaking opportunities present themselves, in which case you may break by
|
||||
// grapheme clusters (using the [FirstGraphemeCluster] function to determine the
|
||||
// grapheme clusters).
|
||||
//
|
||||
// The "mustBreak" flag indicates whether you MUST break the line after the
|
||||
// given segment (true), for example after newline characters, or you MAY break
|
||||
// the line after the given segment (false).
|
||||
//
|
||||
// This function can be called continuously to extract all non-breaking sub-sets
|
||||
// from a byte slice, as illustrated in the example below.
|
||||
//
|
||||
// If you don't know the current state, for example when calling the function
|
||||
// for the first time, you must pass -1. For consecutive calls, pass the state
|
||||
// and rest slice returned by the previous call.
|
||||
//
|
||||
// The "rest" slice is the sub-slice of the original byte slice "b" starting
|
||||
// after the last byte of the identified line segment. If the length of the
|
||||
// "rest" slice is 0, the entire byte slice "b" has been processed. The
|
||||
// "segment" byte slice is the sub-slice of the input slice containing the
|
||||
// identified line segment.
|
||||
//
|
||||
// Given an empty byte slice "b", the function returns nil values.
|
||||
//
|
||||
// Note that in accordance with [UAX #14 LB3], the final segment will end with
|
||||
// "mustBreak" set to true. You can choose to ignore this by checking if the
|
||||
// length of the "rest" slice is 0 and calling [HasTrailingLineBreak] or
|
||||
// [HasTrailingLineBreakInString] on the last rune.
|
||||
//
|
||||
// Note also that this algorithm may break within grapheme clusters. This is
|
||||
// addressed in Section 8.2 Example 6 of UAX #14. To avoid this, you can use
|
||||
// the [Step] function instead.
|
||||
//
|
||||
// [Unicode Standard Annex #14]: https://www.unicode.org/reports/tr14/
|
||||
// [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm
|
||||
func FirstLineSegment(b []byte, state int) (segment, rest []byte, mustBreak bool, newState int) {
|
||||
// An empty byte slice returns nothing.
|
||||
if len(b) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Extract the first rune.
|
||||
r, length := utf8.DecodeRune(b)
|
||||
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
|
||||
return b, nil, true, lbAny // LB3.
|
||||
}
|
||||
|
||||
// If we don't know the state, determine it now.
|
||||
if state < 0 {
|
||||
state, _ = transitionLineBreakState(state, r, b[length:], "")
|
||||
}
|
||||
|
||||
// Transition until we find a boundary.
|
||||
var boundary int
|
||||
for {
|
||||
r, l := utf8.DecodeRune(b[length:])
|
||||
state, boundary = transitionLineBreakState(state, r, b[length+l:], "")
|
||||
|
||||
if boundary != LineDontBreak {
|
||||
return b[:length], b[length:], boundary == LineMustBreak, state
|
||||
}
|
||||
|
||||
length += l
|
||||
if len(b) <= length {
|
||||
return b, nil, true, lbAny // LB3
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FirstLineSegmentInString is like [FirstLineSegment] but its input and outputs
|
||||
// are strings.
|
||||
func FirstLineSegmentInString(str string, state int) (segment, rest string, mustBreak bool, newState int) {
|
||||
// An empty byte slice returns nothing.
|
||||
if len(str) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Extract the first rune.
|
||||
r, length := utf8.DecodeRuneInString(str)
|
||||
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
|
||||
return str, "", true, lbAny // LB3.
|
||||
}
|
||||
|
||||
// If we don't know the state, determine it now.
|
||||
if state < 0 {
|
||||
state, _ = transitionLineBreakState(state, r, nil, str[length:])
|
||||
}
|
||||
|
||||
// Transition until we find a boundary.
|
||||
var boundary int
|
||||
for {
|
||||
r, l := utf8.DecodeRuneInString(str[length:])
|
||||
state, boundary = transitionLineBreakState(state, r, nil, str[length+l:])
|
||||
|
||||
if boundary != LineDontBreak {
|
||||
return str[:length], str[length:], boundary == LineMustBreak, state
|
||||
}
|
||||
|
||||
length += l
|
||||
if len(str) <= length {
|
||||
return str, "", true, lbAny // LB3.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// HasTrailingLineBreak returns true if the last rune in the given byte slice is
|
||||
// one of the hard line break code points defined in LB4 and LB5 of [UAX #14].
|
||||
//
|
||||
// [UAX #14]: https://www.unicode.org/reports/tr14/#Algorithm
|
||||
func HasTrailingLineBreak(b []byte) bool {
|
||||
r, _ := utf8.DecodeLastRune(b)
|
||||
property, _ := propertyLineBreak(r)
|
||||
return property == prBK || property == prCR || property == prLF || property == prNL
|
||||
}
|
||||
|
||||
// HasTrailingLineBreakInString is like [HasTrailingLineBreak] but for a string.
|
||||
func HasTrailingLineBreakInString(str string) bool {
|
||||
r, _ := utf8.DecodeLastRuneInString(str)
|
||||
property, _ := propertyLineBreak(r)
|
||||
return property == prBK || property == prCR || property == prLF || property == prNL
|
||||
}
|
3554
vendor/github.com/rivo/uniseg/lineproperties.go
generated
vendored
Normal file
3554
vendor/github.com/rivo/uniseg/lineproperties.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
626
vendor/github.com/rivo/uniseg/linerules.go
generated
vendored
Normal file
626
vendor/github.com/rivo/uniseg/linerules.go
generated
vendored
Normal file
@ -0,0 +1,626 @@
|
||||
package uniseg
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// The states of the line break parser.
//
// lbAny through lbExtPicCn are consecutive iota values and are the parser
// states proper; they are the "state" half of the keys used in lbTransitions.
// lbZWJBit and lbCPeaFWHBit are not states but flag bits:
// transitionLineBreakState ORs them into the state it returns and strips them
// off again at the start of the next call.
const (
	lbAny = iota
	lbBK
	lbCR
	lbLF
	lbNL
	lbSP
	lbZW
	lbWJ
	lbGL
	lbBA
	lbHY
	lbCL
	lbCP
	lbEX
	lbIS
	lbSY
	lbOP
	lbQU
	lbQUSP
	lbNS
	lbCLCPSP
	lbB2
	lbB2SP
	lbCB
	lbBB
	lbLB21a
	lbHL
	lbAL
	lbNU
	lbPR
	lbEB
	lbIDEM
	lbNUNU
	lbNUSY
	lbNUIS
	lbNUCL
	lbNUCP
	lbPO
	lbJL
	lbJV
	lbJT
	lbH2
	lbH3
	lbOddRI
	lbEvenRI
	lbExtPicCn
	lbZWJBit     = 64  // Flag bit: the previous code point was a zero-width joiner (forces "don't break", LB8a).
	lbCPeaFWHBit = 128 // Flag bit: the state is a CP whose East Asian Width is not F, W, or H (LB30).
)
|
||||
|
||||
// These constants define whether a given text may be broken into the next line.
// If the break is optional (LineCanBreak), you may choose to break or not based
// on your own criteria, for example, if the text has reached the available
// width.
//
// They are the "lineBreak" values produced by lbTransitions and
// transitionLineBreakState. [FirstLineSegment] ends a segment on any value
// other than LineDontBreak and reports LineMustBreak through its "mustBreak"
// result.
const (
	LineDontBreak = iota // You may not break the line here.
	LineCanBreak         // You may or may not break the line here.
	LineMustBreak        // You must break the line here.
)
|
||||
|
||||
// lbTransitions implements the line break parser's state transitions. It's
// analogous to [grTransitions], see comments there for details.
//
// The lookup key packs the current state into the low 32 bits and the line
// break property of the next code point into the high 32 bits. For a matching
// (state, property) pair it returns the state to move to, the break verdict
// in front of the next code point (LineDontBreak, LineCanBreak, or
// LineMustBreak), and a rule number. The rule number looks like the UAX #14
// rule number times ten (LB4 -> 40, LB21a -> 211, LB31 -> 310);
// transitionLineBreakState compares these numbers to decide which of two
// partially matching transitions takes precedence (the lower number wins).
//
// If the table has no entry for the pair, all three results are -1. Entries
// keyed with lbAny or prAny are the wildcard rows which
// transitionLineBreakState consults explicitly when no exact entry exists.
//
// Unicode version 15.0.0.
func lbTransitions(state, prop int) (newState, lineBreak, rule int) {
	switch uint64(state) | uint64(prop)<<32 {
	// LB4.
	case lbBK | prAny<<32:
		return lbAny, LineMustBreak, 40

	// LB5.
	case lbCR | prLF<<32:
		return lbLF, LineDontBreak, 50
	case lbCR | prAny<<32:
		return lbAny, LineMustBreak, 50
	case lbLF | prAny<<32:
		return lbAny, LineMustBreak, 50
	case lbNL | prAny<<32:
		return lbAny, LineMustBreak, 50

	// LB6.
	case lbAny | prBK<<32:
		return lbBK, LineDontBreak, 60
	case lbAny | prCR<<32:
		return lbCR, LineDontBreak, 60
	case lbAny | prLF<<32:
		return lbLF, LineDontBreak, 60
	case lbAny | prNL<<32:
		return lbNL, LineDontBreak, 60

	// LB7.
	case lbAny | prSP<<32:
		return lbSP, LineDontBreak, 70
	case lbAny | prZW<<32:
		return lbZW, LineDontBreak, 70

	// LB8.
	case lbZW | prSP<<32:
		return lbZW, LineDontBreak, 70
	case lbZW | prAny<<32:
		return lbAny, LineCanBreak, 80

	// LB11.
	case lbAny | prWJ<<32:
		return lbWJ, LineDontBreak, 110
	case lbWJ | prAny<<32:
		return lbAny, LineDontBreak, 110

	// LB12.
	case lbAny | prGL<<32:
		return lbGL, LineCanBreak, 310
	case lbGL | prAny<<32:
		return lbAny, LineDontBreak, 120

	// LB13 (simple transitions).
	case lbAny | prCL<<32:
		return lbCL, LineCanBreak, 310
	case lbAny | prCP<<32:
		return lbCP, LineCanBreak, 310
	case lbAny | prEX<<32:
		return lbEX, LineDontBreak, 130
	case lbAny | prIS<<32:
		return lbIS, LineCanBreak, 310
	case lbAny | prSY<<32:
		return lbSY, LineCanBreak, 310

	// LB14.
	case lbAny | prOP<<32:
		return lbOP, LineCanBreak, 310
	case lbOP | prSP<<32:
		return lbOP, LineDontBreak, 70
	case lbOP | prAny<<32:
		return lbAny, LineDontBreak, 140

	// LB15.
	case lbQU | prSP<<32:
		return lbQUSP, LineDontBreak, 70
	case lbQU | prOP<<32:
		return lbOP, LineDontBreak, 150
	case lbQUSP | prOP<<32:
		return lbOP, LineDontBreak, 150

	// LB16.
	case lbCL | prSP<<32:
		return lbCLCPSP, LineDontBreak, 70
	case lbNUCL | prSP<<32:
		return lbCLCPSP, LineDontBreak, 70
	case lbCP | prSP<<32:
		return lbCLCPSP, LineDontBreak, 70
	case lbNUCP | prSP<<32:
		return lbCLCPSP, LineDontBreak, 70
	case lbCL | prNS<<32:
		return lbNS, LineDontBreak, 160
	case lbNUCL | prNS<<32:
		return lbNS, LineDontBreak, 160
	case lbCP | prNS<<32:
		return lbNS, LineDontBreak, 160
	case lbNUCP | prNS<<32:
		return lbNS, LineDontBreak, 160
	case lbCLCPSP | prNS<<32:
		return lbNS, LineDontBreak, 160

	// LB17.
	case lbAny | prB2<<32:
		return lbB2, LineCanBreak, 310
	case lbB2 | prSP<<32:
		return lbB2SP, LineDontBreak, 70
	case lbB2 | prB2<<32:
		return lbB2, LineDontBreak, 170
	case lbB2SP | prB2<<32:
		return lbB2, LineDontBreak, 170

	// LB18.
	case lbSP | prAny<<32:
		return lbAny, LineCanBreak, 180
	case lbQUSP | prAny<<32:
		return lbAny, LineCanBreak, 180
	case lbCLCPSP | prAny<<32:
		return lbAny, LineCanBreak, 180
	case lbB2SP | prAny<<32:
		return lbAny, LineCanBreak, 180

	// LB19.
	case lbAny | prQU<<32:
		return lbQU, LineDontBreak, 190
	case lbQU | prAny<<32:
		return lbAny, LineDontBreak, 190

	// LB20.
	case lbAny | prCB<<32:
		return lbCB, LineCanBreak, 200
	case lbCB | prAny<<32:
		return lbAny, LineCanBreak, 200

	// LB21.
	case lbAny | prBA<<32:
		return lbBA, LineDontBreak, 210
	case lbAny | prHY<<32:
		return lbHY, LineDontBreak, 210
	case lbAny | prNS<<32:
		return lbNS, LineDontBreak, 210
	case lbAny | prBB<<32:
		return lbBB, LineCanBreak, 310
	case lbBB | prAny<<32:
		return lbAny, LineDontBreak, 210

	// LB21a.
	case lbAny | prHL<<32:
		return lbHL, LineCanBreak, 310
	case lbHL | prHY<<32:
		return lbLB21a, LineDontBreak, 210
	case lbHL | prBA<<32:
		return lbLB21a, LineDontBreak, 210
	case lbLB21a | prAny<<32:
		return lbAny, LineDontBreak, 211

	// LB21b.
	case lbSY | prHL<<32:
		return lbHL, LineDontBreak, 212
	case lbNUSY | prHL<<32:
		return lbHL, LineDontBreak, 212

	// LB22.
	case lbAny | prIN<<32:
		return lbAny, LineDontBreak, 220

	// LB23.
	case lbAny | prAL<<32:
		return lbAL, LineCanBreak, 310
	case lbAny | prNU<<32:
		return lbNU, LineCanBreak, 310
	case lbAL | prNU<<32:
		return lbNU, LineDontBreak, 230
	case lbHL | prNU<<32:
		return lbNU, LineDontBreak, 230
	case lbNU | prAL<<32:
		return lbAL, LineDontBreak, 230
	case lbNU | prHL<<32:
		return lbHL, LineDontBreak, 230
	case lbNUNU | prAL<<32:
		return lbAL, LineDontBreak, 230
	case lbNUNU | prHL<<32:
		return lbHL, LineDontBreak, 230

	// LB23a.
	case lbAny | prPR<<32:
		return lbPR, LineCanBreak, 310
	case lbAny | prID<<32:
		return lbIDEM, LineCanBreak, 310
	case lbAny | prEB<<32:
		return lbEB, LineCanBreak, 310
	case lbAny | prEM<<32:
		return lbIDEM, LineCanBreak, 310
	case lbPR | prID<<32:
		return lbIDEM, LineDontBreak, 231
	case lbPR | prEB<<32:
		return lbEB, LineDontBreak, 231
	case lbPR | prEM<<32:
		return lbIDEM, LineDontBreak, 231
	case lbIDEM | prPO<<32:
		return lbPO, LineDontBreak, 231
	case lbEB | prPO<<32:
		return lbPO, LineDontBreak, 231

	// LB24.
	case lbAny | prPO<<32:
		return lbPO, LineCanBreak, 310
	case lbPR | prAL<<32:
		return lbAL, LineDontBreak, 240
	case lbPR | prHL<<32:
		return lbHL, LineDontBreak, 240
	case lbPO | prAL<<32:
		return lbAL, LineDontBreak, 240
	case lbPO | prHL<<32:
		return lbHL, LineDontBreak, 240
	case lbAL | prPR<<32:
		return lbPR, LineDontBreak, 240
	case lbAL | prPO<<32:
		return lbPO, LineDontBreak, 240
	case lbHL | prPR<<32:
		return lbPR, LineDontBreak, 240
	case lbHL | prPO<<32:
		return lbPO, LineDontBreak, 240

	// LB25 (simple transitions).
	case lbPR | prNU<<32:
		return lbNU, LineDontBreak, 250
	case lbPO | prNU<<32:
		return lbNU, LineDontBreak, 250
	case lbOP | prNU<<32:
		return lbNU, LineDontBreak, 250
	case lbHY | prNU<<32:
		return lbNU, LineDontBreak, 250
	case lbNU | prNU<<32:
		return lbNUNU, LineDontBreak, 250
	case lbNU | prSY<<32:
		return lbNUSY, LineDontBreak, 250
	case lbNU | prIS<<32:
		return lbNUIS, LineDontBreak, 250
	case lbNUNU | prNU<<32:
		return lbNUNU, LineDontBreak, 250
	case lbNUNU | prSY<<32:
		return lbNUSY, LineDontBreak, 250
	case lbNUNU | prIS<<32:
		return lbNUIS, LineDontBreak, 250
	case lbNUSY | prNU<<32:
		return lbNUNU, LineDontBreak, 250
	case lbNUSY | prSY<<32:
		return lbNUSY, LineDontBreak, 250
	case lbNUSY | prIS<<32:
		return lbNUIS, LineDontBreak, 250
	case lbNUIS | prNU<<32:
		return lbNUNU, LineDontBreak, 250
	case lbNUIS | prSY<<32:
		return lbNUSY, LineDontBreak, 250
	case lbNUIS | prIS<<32:
		return lbNUIS, LineDontBreak, 250
	case lbNU | prCL<<32:
		return lbNUCL, LineDontBreak, 250
	case lbNU | prCP<<32:
		return lbNUCP, LineDontBreak, 250
	case lbNUNU | prCL<<32:
		return lbNUCL, LineDontBreak, 250
	case lbNUNU | prCP<<32:
		return lbNUCP, LineDontBreak, 250
	case lbNUSY | prCL<<32:
		return lbNUCL, LineDontBreak, 250
	case lbNUSY | prCP<<32:
		return lbNUCP, LineDontBreak, 250
	case lbNUIS | prCL<<32:
		return lbNUCL, LineDontBreak, 250
	case lbNUIS | prCP<<32:
		return lbNUCP, LineDontBreak, 250
	case lbNU | prPO<<32:
		return lbPO, LineDontBreak, 250
	case lbNUNU | prPO<<32:
		return lbPO, LineDontBreak, 250
	case lbNUSY | prPO<<32:
		return lbPO, LineDontBreak, 250
	case lbNUIS | prPO<<32:
		return lbPO, LineDontBreak, 250
	case lbNUCL | prPO<<32:
		return lbPO, LineDontBreak, 250
	case lbNUCP | prPO<<32:
		return lbPO, LineDontBreak, 250
	case lbNU | prPR<<32:
		return lbPR, LineDontBreak, 250
	case lbNUNU | prPR<<32:
		return lbPR, LineDontBreak, 250
	case lbNUSY | prPR<<32:
		return lbPR, LineDontBreak, 250
	case lbNUIS | prPR<<32:
		return lbPR, LineDontBreak, 250
	case lbNUCL | prPR<<32:
		return lbPR, LineDontBreak, 250
	case lbNUCP | prPR<<32:
		return lbPR, LineDontBreak, 250

	// LB26.
	case lbAny | prJL<<32:
		return lbJL, LineCanBreak, 310
	case lbAny | prJV<<32:
		return lbJV, LineCanBreak, 310
	case lbAny | prJT<<32:
		return lbJT, LineCanBreak, 310
	case lbAny | prH2<<32:
		return lbH2, LineCanBreak, 310
	case lbAny | prH3<<32:
		return lbH3, LineCanBreak, 310
	case lbJL | prJL<<32:
		return lbJL, LineDontBreak, 260
	case lbJL | prJV<<32:
		return lbJV, LineDontBreak, 260
	case lbJL | prH2<<32:
		return lbH2, LineDontBreak, 260
	case lbJL | prH3<<32:
		return lbH3, LineDontBreak, 260
	case lbJV | prJV<<32:
		return lbJV, LineDontBreak, 260
	case lbJV | prJT<<32:
		return lbJT, LineDontBreak, 260
	case lbH2 | prJV<<32:
		return lbJV, LineDontBreak, 260
	case lbH2 | prJT<<32:
		return lbJT, LineDontBreak, 260
	case lbJT | prJT<<32:
		return lbJT, LineDontBreak, 260
	case lbH3 | prJT<<32:
		return lbJT, LineDontBreak, 260

	// LB27.
	case lbJL | prPO<<32:
		return lbPO, LineDontBreak, 270
	case lbJV | prPO<<32:
		return lbPO, LineDontBreak, 270
	case lbJT | prPO<<32:
		return lbPO, LineDontBreak, 270
	case lbH2 | prPO<<32:
		return lbPO, LineDontBreak, 270
	case lbH3 | prPO<<32:
		return lbPO, LineDontBreak, 270
	case lbPR | prJL<<32:
		return lbJL, LineDontBreak, 270
	case lbPR | prJV<<32:
		return lbJV, LineDontBreak, 270
	case lbPR | prJT<<32:
		return lbJT, LineDontBreak, 270
	case lbPR | prH2<<32:
		return lbH2, LineDontBreak, 270
	case lbPR | prH3<<32:
		return lbH3, LineDontBreak, 270

	// LB28.
	case lbAL | prAL<<32:
		return lbAL, LineDontBreak, 280
	case lbAL | prHL<<32:
		return lbHL, LineDontBreak, 280
	case lbHL | prAL<<32:
		return lbAL, LineDontBreak, 280
	case lbHL | prHL<<32:
		return lbHL, LineDontBreak, 280

	// LB29.
	case lbIS | prAL<<32:
		return lbAL, LineDontBreak, 290
	case lbIS | prHL<<32:
		return lbHL, LineDontBreak, 290
	case lbNUIS | prAL<<32:
		return lbAL, LineDontBreak, 290
	case lbNUIS | prHL<<32:
		return lbHL, LineDontBreak, 290

	default:
		// No entry for this (state, property) pair.
		return -1, -1, -1
	}
}
|
||||
|
||||
// transitionLineBreakState determines the new state of the line break parser
|
||||
// given the current state and the next code point. It also returns the type of
|
||||
// line break: LineDontBreak, LineCanBreak, or LineMustBreak. If more than one
|
||||
// code point is needed to determine the new state, the byte slice or the string
|
||||
// starting after rune "r" can be used (whichever is not nil or empty) for
|
||||
// further lookups.
|
||||
func transitionLineBreakState(state int, r rune, b []byte, str string) (newState int, lineBreak int) {
|
||||
// Determine the property of the next character.
|
||||
nextProperty, generalCategory := propertyLineBreak(r)
|
||||
|
||||
// Prepare.
|
||||
var forceNoBreak, isCPeaFWH bool
|
||||
if state >= 0 && state&lbCPeaFWHBit != 0 {
|
||||
isCPeaFWH = true // LB30: CP but ea is not F, W, or H.
|
||||
state = state &^ lbCPeaFWHBit
|
||||
}
|
||||
if state >= 0 && state&lbZWJBit != 0 {
|
||||
state = state &^ lbZWJBit // Extract zero-width joiner bit.
|
||||
forceNoBreak = true // LB8a.
|
||||
}
|
||||
|
||||
defer func() {
|
||||
// Transition into LB30.
|
||||
if newState == lbCP || newState == lbNUCP {
|
||||
ea := propertyEastAsianWidth(r)
|
||||
if ea != prF && ea != prW && ea != prH {
|
||||
newState |= lbCPeaFWHBit
|
||||
}
|
||||
}
|
||||
|
||||
// Override break.
|
||||
if forceNoBreak {
|
||||
lineBreak = LineDontBreak
|
||||
}
|
||||
}()
|
||||
|
||||
// LB1.
|
||||
if nextProperty == prAI || nextProperty == prSG || nextProperty == prXX {
|
||||
nextProperty = prAL
|
||||
} else if nextProperty == prSA {
|
||||
if generalCategory == gcMn || generalCategory == gcMc {
|
||||
nextProperty = prCM
|
||||
} else {
|
||||
nextProperty = prAL
|
||||
}
|
||||
} else if nextProperty == prCJ {
|
||||
nextProperty = prNS
|
||||
}
|
||||
|
||||
// Combining marks.
|
||||
if nextProperty == prZWJ || nextProperty == prCM {
|
||||
var bit int
|
||||
if nextProperty == prZWJ {
|
||||
bit = lbZWJBit
|
||||
}
|
||||
mustBreakState := state < 0 || state == lbBK || state == lbCR || state == lbLF || state == lbNL
|
||||
if !mustBreakState && state != lbSP && state != lbZW && state != lbQUSP && state != lbCLCPSP && state != lbB2SP {
|
||||
// LB9.
|
||||
return state | bit, LineDontBreak
|
||||
} else {
|
||||
// LB10.
|
||||
if mustBreakState {
|
||||
return lbAL | bit, LineMustBreak
|
||||
}
|
||||
return lbAL | bit, LineCanBreak
|
||||
}
|
||||
}
|
||||
|
||||
// Find the applicable transition in the table.
|
||||
var rule int
|
||||
newState, lineBreak, rule = lbTransitions(state, nextProperty)
|
||||
if newState < 0 {
|
||||
// No specific transition found. Try the less specific ones.
|
||||
anyPropProp, anyPropLineBreak, anyPropRule := lbTransitions(state, prAny)
|
||||
anyStateProp, anyStateLineBreak, anyStateRule := lbTransitions(lbAny, nextProperty)
|
||||
if anyPropProp >= 0 && anyStateProp >= 0 {
|
||||
// Both apply. We'll use a mix (see comments for grTransitions).
|
||||
newState, lineBreak, rule = anyStateProp, anyStateLineBreak, anyStateRule
|
||||
if anyPropRule < anyStateRule {
|
||||
lineBreak, rule = anyPropLineBreak, anyPropRule
|
||||
}
|
||||
} else if anyPropProp >= 0 {
|
||||
// We only have a specific state.
|
||||
newState, lineBreak, rule = anyPropProp, anyPropLineBreak, anyPropRule
|
||||
// This branch will probably never be reached because okAnyState will
|
||||
// always be true given the current transition map. But we keep it here
|
||||
// for future modifications to the transition map where this may not be
|
||||
// true anymore.
|
||||
} else if anyStateProp >= 0 {
|
||||
// We only have a specific property.
|
||||
newState, lineBreak, rule = anyStateProp, anyStateLineBreak, anyStateRule
|
||||
} else {
|
||||
// No known transition. LB31: ALL ÷ ALL.
|
||||
newState, lineBreak, rule = lbAny, LineCanBreak, 310
|
||||
}
|
||||
}
|
||||
|
||||
// LB12a.
|
||||
if rule > 121 &&
|
||||
nextProperty == prGL &&
|
||||
(state != lbSP && state != lbBA && state != lbHY && state != lbLB21a && state != lbQUSP && state != lbCLCPSP && state != lbB2SP) {
|
||||
return lbGL, LineDontBreak
|
||||
}
|
||||
|
||||
// LB13.
|
||||
if rule > 130 && state != lbNU && state != lbNUNU {
|
||||
switch nextProperty {
|
||||
case prCL:
|
||||
return lbCL, LineDontBreak
|
||||
case prCP:
|
||||
return lbCP, LineDontBreak
|
||||
case prIS:
|
||||
return lbIS, LineDontBreak
|
||||
case prSY:
|
||||
return lbSY, LineDontBreak
|
||||
}
|
||||
}
|
||||
|
||||
// LB25 (look ahead).
|
||||
if rule > 250 &&
|
||||
(state == lbPR || state == lbPO) &&
|
||||
nextProperty == prOP || nextProperty == prHY {
|
||||
var r rune
|
||||
if b != nil { // Byte slice version.
|
||||
r, _ = utf8.DecodeRune(b)
|
||||
} else { // String version.
|
||||
r, _ = utf8.DecodeRuneInString(str)
|
||||
}
|
||||
if r != utf8.RuneError {
|
||||
pr, _ := propertyLineBreak(r)
|
||||
if pr == prNU {
|
||||
return lbNU, LineDontBreak
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// LB30 (part one).
|
||||
if rule > 300 {
|
||||
if (state == lbAL || state == lbHL || state == lbNU || state == lbNUNU) && nextProperty == prOP {
|
||||
ea := propertyEastAsianWidth(r)
|
||||
if ea != prF && ea != prW && ea != prH {
|
||||
return lbOP, LineDontBreak
|
||||
}
|
||||
} else if isCPeaFWH {
|
||||
switch nextProperty {
|
||||
case prAL:
|
||||
return lbAL, LineDontBreak
|
||||
case prHL:
|
||||
return lbHL, LineDontBreak
|
||||
case prNU:
|
||||
return lbNU, LineDontBreak
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// LB30a.
|
||||
if newState == lbAny && nextProperty == prRI {
|
||||
if state != lbOddRI && state != lbEvenRI { // Includes state == -1.
|
||||
// Transition into the first RI.
|
||||
return lbOddRI, lineBreak
|
||||
}
|
||||
if state == lbOddRI {
|
||||
// Don't break pairs of Regional Indicators.
|
||||
return lbEvenRI, LineDontBreak
|
||||
}
|
||||
return lbOddRI, lineBreak
|
||||
}
|
||||
|
||||
// LB30b.
|
||||
if rule > 302 {
|
||||
if nextProperty == prEM {
|
||||
if state == lbEB || state == lbExtPicCn {
|
||||
return prAny, LineDontBreak
|
||||
}
|
||||
}
|
||||
graphemeProperty := propertyGraphemes(r)
|
||||
if graphemeProperty == prExtendedPictographic && generalCategory == gcCn {
|
||||
return lbExtPicCn, LineCanBreak
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
208
vendor/github.com/rivo/uniseg/properties.go
generated
vendored
Normal file
208
vendor/github.com/rivo/uniseg/properties.go
generated
vendored
Normal file
@ -0,0 +1,208 @@
|
||||
package uniseg
|
||||
|
||||
// The Unicode properties as used in the various parsers. Only the ones needed
// in the context of this package are included.
//
// prXX is the explicit value 0, which is also what the lookup helpers return
// for code points not found in a table (propertySearch yields a zero entry).
// All other constants are consecutive iota values starting at prAny.
const (
	prXX      = 0    // Explicit zero. NOTE(review): not numerically equal to prAny, see next line.
	prAny     = iota // NOTE(review): evaluates to 1, not 0 — iota is the index of the ConstSpec and prXX above occupies index 0. The original comments ("Same as prAny" / "prAny must be 0") do not match the code; confirm nothing relies on prAny == 0.
	prPrepend        // Grapheme properties must come first, to reduce the number of bits stored in the state vector.
	prCR
	prLF
	prControl
	prExtend
	prRegionalIndicator
	prSpacingMark
	prL
	prV
	prT
	prLV
	prLVT
	prZWJ
	prExtendedPictographic
	prNewline
	prWSegSpace
	prDoubleQuote
	prSingleQuote
	prMidNumLet
	prNumeric
	prMidLetter
	prMidNum
	prExtendNumLet
	prALetter
	prFormat
	prHebrewLetter
	prKatakana
	prSp
	prSTerm
	prClose
	prSContinue
	prATerm
	prUpper
	prLower
	prSep
	prOLetter
	prCM
	prBA
	prBK
	prSP
	prEX
	prQU
	prAL
	prPR
	prPO
	prOP
	prCP
	prIS
	prHY
	prSY
	prNU
	prCL
	prNL
	prGL
	prAI
	prBB
	prHL
	prSA
	prJL
	prJV
	prJT
	prNS
	prZW
	prB2
	prIN
	prWJ
	prID
	prEB
	prCJ
	prH2
	prH3
	prSG
	prCB
	prRI
	prEM
	prN
	prNa
	prA
	prW
	prH
	prF
	prEmojiPresentation
)
|
||||
|
||||
// Unicode General Categories. Only the ones needed in the context of this
// package are included.
//
// propertyLineBreak returns one of these values as its second result; a code
// point missing from the line break table yields gcNone because the lookup
// falls back to a zero entry.
const (
	gcNone = iota // gcNone must be 0.
	gcCc
	gcZs
	gcPo
	gcSc
	gcPs
	gcPe
	gcSm
	gcPd
	gcNd
	gcLu
	gcSk
	gcPc
	gcLl
	gcSo
	gcLo
	gcPi
	gcCf
	gcNo
	gcPf
	gcLC
	gcLm
	gcMn
	gcMe
	gcMc
	gcNl
	gcZl
	gcZp
	gcCn
	gcCs
	gcCo
)
|
||||
|
||||
// Special code points: the two Unicode variation selectors which request text
// or emoji presentation for the preceding character.
const (
	vs15 = 0xfe0e // Variation Selector-15 (text presentation)
	vs16 = 0xfe0f // Variation Selector-16 (emoji presentation)
)
|
||||
|
||||
// propertySearch looks up the code point r in a property table using binary
// search. Each table entry is an array whose first element is the start and
// whose second element is the (inclusive) end of a code point range; the
// table must be sorted by range. The entry whose range contains r is returned.
// If there is no such entry, the result is an array of zeros.
func propertySearch[E interface{ [3]int | [4]int }](dictionary []E, r rune) (result E) {
	cp := int(r)
	lo, hi := 0, len(dictionary)
	for lo < hi {
		mid := (lo + hi) / 2
		entry := dictionary[mid]
		switch {
		case cp < entry[0]:
			// r lies before this range: continue in the lower half.
			hi = mid
		case cp > entry[1]:
			// r lies after this range: continue in the upper half.
			lo = mid + 1
		default:
			return entry
		}
	}
	// Not found: result still holds the zero value of E.
	return result
}
|
||||
|
||||
// property returns the Unicode property value (see constants above) of the
|
||||
// given code point.
|
||||
func property(dictionary [][3]int, r rune) int {
|
||||
return propertySearch(dictionary, r)[2]
|
||||
}
|
||||
|
||||
// propertyLineBreak returns the Unicode property value and General Category
|
||||
// (see constants above) of the given code point, as listed in the line break
|
||||
// code points table, while fast tracking ASCII digits and letters.
|
||||
func propertyLineBreak(r rune) (property, generalCategory int) {
|
||||
if r >= 'a' && r <= 'z' {
|
||||
return prAL, gcLl
|
||||
}
|
||||
if r >= 'A' && r <= 'Z' {
|
||||
return prAL, gcLu
|
||||
}
|
||||
if r >= '0' && r <= '9' {
|
||||
return prNU, gcNd
|
||||
}
|
||||
entry := propertySearch(lineBreakCodePoints, r)
|
||||
return entry[2], entry[3]
|
||||
}
|
||||
|
||||
// propertyGraphemes returns the Unicode grapheme cluster property value of the
|
||||
// given code point while fast tracking ASCII characters.
|
||||
func propertyGraphemes(r rune) int {
|
||||
if r >= 0x20 && r <= 0x7e {
|
||||
return prAny
|
||||
}
|
||||
if r == 0x0a {
|
||||
return prLF
|
||||
}
|
||||
if r == 0x0d {
|
||||
return prCR
|
||||
}
|
||||
if r >= 0 && r <= 0x1f || r == 0x7f {
|
||||
return prControl
|
||||
}
|
||||
return property(graphemeCodePoints, r)
|
||||
}
|
||||
|
||||
// propertyEastAsianWidth returns the Unicode East Asian Width property value of
|
||||
// the given code point while fast tracking ASCII characters.
|
||||
func propertyEastAsianWidth(r rune) int {
|
||||
if r >= 0x20 && r <= 0x7e {
|
||||
return prNa
|
||||
}
|
||||
if r >= 0 && r <= 0x1f || r == 0x7f {
|
||||
return prN
|
||||
}
|
||||
return property(eastAsianWidth, r)
|
||||
}
|
90
vendor/github.com/rivo/uniseg/sentence.go
generated
vendored
Normal file
90
vendor/github.com/rivo/uniseg/sentence.go
generated
vendored
Normal file
@ -0,0 +1,90 @@
|
||||
package uniseg
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// FirstSentence returns the first sentence found in the given byte slice
|
||||
// according to the rules of [Unicode Standard Annex #29, Sentence Boundaries].
|
||||
// This function can be called continuously to extract all sentences from a byte
|
||||
// slice, as illustrated in the example below.
|
||||
//
|
||||
// If you don't know the current state, for example when calling the function
|
||||
// for the first time, you must pass -1. For consecutive calls, pass the state
|
||||
// and rest slice returned by the previous call.
|
||||
//
|
||||
// The "rest" slice is the sub-slice of the original byte slice "b" starting
|
||||
// after the last byte of the identified sentence. If the length of the "rest"
|
||||
// slice is 0, the entire byte slice "b" has been processed. The "sentence" byte
|
||||
// slice is the sub-slice of the input slice containing the identified sentence.
|
||||
//
|
||||
// Given an empty byte slice "b", the function returns nil values.
|
||||
//
|
||||
// [Unicode Standard Annex #29, Sentence Boundaries]: http://unicode.org/reports/tr29/#Sentence_Boundaries
|
||||
func FirstSentence(b []byte, state int) (sentence, rest []byte, newState int) {
|
||||
// An empty byte slice returns nothing.
|
||||
if len(b) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Extract the first rune.
|
||||
r, length := utf8.DecodeRune(b)
|
||||
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
|
||||
return b, nil, sbAny
|
||||
}
|
||||
|
||||
// If we don't know the state, determine it now.
|
||||
if state < 0 {
|
||||
state, _ = transitionSentenceBreakState(state, r, b[length:], "")
|
||||
}
|
||||
|
||||
// Transition until we find a boundary.
|
||||
var boundary bool
|
||||
for {
|
||||
r, l := utf8.DecodeRune(b[length:])
|
||||
state, boundary = transitionSentenceBreakState(state, r, b[length+l:], "")
|
||||
|
||||
if boundary {
|
||||
return b[:length], b[length:], state
|
||||
}
|
||||
|
||||
length += l
|
||||
if len(b) <= length {
|
||||
return b, nil, sbAny
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FirstSentenceInString is like [FirstSentence] but its input and outputs are
|
||||
// strings.
|
||||
func FirstSentenceInString(str string, state int) (sentence, rest string, newState int) {
|
||||
// An empty byte slice returns nothing.
|
||||
if len(str) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Extract the first rune.
|
||||
r, length := utf8.DecodeRuneInString(str)
|
||||
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
|
||||
return str, "", sbAny
|
||||
}
|
||||
|
||||
// If we don't know the state, determine it now.
|
||||
if state < 0 {
|
||||
state, _ = transitionSentenceBreakState(state, r, nil, str[length:])
|
||||
}
|
||||
|
||||
// Transition until we find a boundary.
|
||||
var boundary bool
|
||||
for {
|
||||
r, l := utf8.DecodeRuneInString(str[length:])
|
||||
state, boundary = transitionSentenceBreakState(state, r, nil, str[length+l:])
|
||||
|
||||
if boundary {
|
||||
return str[:length], str[length:], state
|
||||
}
|
||||
|
||||
length += l
|
||||
if len(str) <= length {
|
||||
return str, "", sbAny
|
||||
}
|
||||
}
|
||||
}
|
2845
vendor/github.com/rivo/uniseg/sentenceproperties.go
generated
vendored
Normal file
2845
vendor/github.com/rivo/uniseg/sentenceproperties.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
276
vendor/github.com/rivo/uniseg/sentencerules.go
generated
vendored
Normal file
276
vendor/github.com/rivo/uniseg/sentencerules.go
generated
vendored
Normal file
@ -0,0 +1,276 @@
|
||||
package uniseg
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// The states of the sentence break parser.
//
// These are the "state" half of the keys used in sbTransitions. sbAny is also
// the state [FirstSentence] and [FirstSentenceInString] report once the input
// has been consumed completely.
const (
	sbAny = iota
	sbCR
	sbParaSep
	sbATerm
	sbUpper
	sbLower
	sbSB7
	sbSB8Close
	sbSB8Sp
	sbSTerm
	sbSB8aClose
	sbSB8aSp
)
|
||||
|
||||
// sbTransitions implements the sentence break parser's state transitions. It's
|
||||
// anologous to [grTransitions], see comments there for details.
|
||||
//
|
||||
// Unicode version 15.0.0.
|
||||
func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) {
|
||||
switch uint64(state) | uint64(prop)<<32 {
|
||||
// SB3.
|
||||
case sbAny | prCR<<32:
|
||||
return sbCR, false, 9990
|
||||
case sbCR | prLF<<32:
|
||||
return sbParaSep, false, 30
|
||||
|
||||
// SB4.
|
||||
case sbAny | prSep<<32:
|
||||
return sbParaSep, false, 9990
|
||||
case sbAny | prLF<<32:
|
||||
return sbParaSep, false, 9990
|
||||
case sbParaSep | prAny<<32:
|
||||
return sbAny, true, 40
|
||||
case sbCR | prAny<<32:
|
||||
return sbAny, true, 40
|
||||
|
||||
// SB6.
|
||||
case sbAny | prATerm<<32:
|
||||
return sbATerm, false, 9990
|
||||
case sbATerm | prNumeric<<32:
|
||||
return sbAny, false, 60
|
||||
case sbSB7 | prNumeric<<32:
|
||||
return sbAny, false, 60 // Because ATerm also appears in SB7.
|
||||
|
||||
// SB7.
|
||||
case sbAny | prUpper<<32:
|
||||
return sbUpper, false, 9990
|
||||
case sbAny | prLower<<32:
|
||||
return sbLower, false, 9990
|
||||
case sbUpper | prATerm<<32:
|
||||
return sbSB7, false, 70
|
||||
case sbLower | prATerm<<32:
|
||||
return sbSB7, false, 70
|
||||
case sbSB7 | prUpper<<32:
|
||||
return sbUpper, false, 70
|
||||
|
||||
// SB8a.
|
||||
case sbAny | prSTerm<<32:
|
||||
return sbSTerm, false, 9990
|
||||
case sbATerm | prSContinue<<32:
|
||||
return sbAny, false, 81
|
||||
case sbATerm | prATerm<<32:
|
||||
return sbATerm, false, 81
|
||||
case sbATerm | prSTerm<<32:
|
||||
return sbSTerm, false, 81
|
||||
case sbSB7 | prSContinue<<32:
|
||||
return sbAny, false, 81
|
||||
case sbSB7 | prATerm<<32:
|
||||
return sbATerm, false, 81
|
||||
case sbSB7 | prSTerm<<32:
|
||||
return sbSTerm, false, 81
|
||||
case sbSB8Close | prSContinue<<32:
|
||||
return sbAny, false, 81
|
||||
case sbSB8Close | prATerm<<32:
|
||||
return sbATerm, false, 81
|
||||
case sbSB8Close | prSTerm<<32:
|
||||
return sbSTerm, false, 81
|
||||
case sbSB8Sp | prSContinue<<32:
|
||||
return sbAny, false, 81
|
||||
case sbSB8Sp | prATerm<<32:
|
||||
return sbATerm, false, 81
|
||||
case sbSB8Sp | prSTerm<<32:
|
||||
return sbSTerm, false, 81
|
||||
case sbSTerm | prSContinue<<32:
|
||||
return sbAny, false, 81
|
||||
case sbSTerm | prATerm<<32:
|
||||
return sbATerm, false, 81
|
||||
case sbSTerm | prSTerm<<32:
|
||||
return sbSTerm, false, 81
|
||||
case sbSB8aClose | prSContinue<<32:
|
||||
return sbAny, false, 81
|
||||
case sbSB8aClose | prATerm<<32:
|
||||
return sbATerm, false, 81
|
||||
case sbSB8aClose | prSTerm<<32:
|
||||
return sbSTerm, false, 81
|
||||
case sbSB8aSp | prSContinue<<32:
|
||||
return sbAny, false, 81
|
||||
case sbSB8aSp | prATerm<<32:
|
||||
return sbATerm, false, 81
|
||||
case sbSB8aSp | prSTerm<<32:
|
||||
return sbSTerm, false, 81
|
||||
|
||||
// SB9.
|
||||
case sbATerm | prClose<<32:
|
||||
return sbSB8Close, false, 90
|
||||
case sbSB7 | prClose<<32:
|
||||
return sbSB8Close, false, 90
|
||||
case sbSB8Close | prClose<<32:
|
||||
return sbSB8Close, false, 90
|
||||
case sbATerm | prSp<<32:
|
||||
return sbSB8Sp, false, 90
|
||||
case sbSB7 | prSp<<32:
|
||||
return sbSB8Sp, false, 90
|
||||
case sbSB8Close | prSp<<32:
|
||||
return sbSB8Sp, false, 90
|
||||
case sbSTerm | prClose<<32:
|
||||
return sbSB8aClose, false, 90
|
||||
case sbSB8aClose | prClose<<32:
|
||||
return sbSB8aClose, false, 90
|
||||
case sbSTerm | prSp<<32:
|
||||
return sbSB8aSp, false, 90
|
||||
case sbSB8aClose | prSp<<32:
|
||||
return sbSB8aSp, false, 90
|
||||
case sbATerm | prSep<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbATerm | prCR<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbATerm | prLF<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbSB7 | prSep<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbSB7 | prCR<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbSB7 | prLF<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbSB8Close | prSep<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbSB8Close | prCR<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbSB8Close | prLF<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbSTerm | prSep<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbSTerm | prCR<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbSTerm | prLF<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbSB8aClose | prSep<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbSB8aClose | prCR<<32:
|
||||
return sbParaSep, false, 90
|
||||
case sbSB8aClose | prLF<<32:
|
||||
return sbParaSep, false, 90
|
||||
|
||||
// SB10.
|
||||
case sbSB8Sp | prSp<<32:
|
||||
return sbSB8Sp, false, 100
|
||||
case sbSB8aSp | prSp<<32:
|
||||
return sbSB8aSp, false, 100
|
||||
case sbSB8Sp | prSep<<32:
|
||||
return sbParaSep, false, 100
|
||||
case sbSB8Sp | prCR<<32:
|
||||
return sbParaSep, false, 100
|
||||
case sbSB8Sp | prLF<<32:
|
||||
return sbParaSep, false, 100
|
||||
|
||||
// SB11.
|
||||
case sbATerm | prAny<<32:
|
||||
return sbAny, true, 110
|
||||
case sbSB7 | prAny<<32:
|
||||
return sbAny, true, 110
|
||||
case sbSB8Close | prAny<<32:
|
||||
return sbAny, true, 110
|
||||
case sbSB8Sp | prAny<<32:
|
||||
return sbAny, true, 110
|
||||
case sbSTerm | prAny<<32:
|
||||
return sbAny, true, 110
|
||||
case sbSB8aClose | prAny<<32:
|
||||
return sbAny, true, 110
|
||||
case sbSB8aSp | prAny<<32:
|
||||
return sbAny, true, 110
|
||||
// We'll always break after ParaSep due to SB4.
|
||||
|
||||
default:
|
||||
return -1, false, -1
|
||||
}
|
||||
}
|
||||
|
||||
// transitionSentenceBreakState determines the new state of the sentence break
|
||||
// parser given the current state and the next code point. It also returns
|
||||
// whether a sentence boundary was detected. If more than one code point is
|
||||
// needed to determine the new state, the byte slice or the string starting
|
||||
// after rune "r" can be used (whichever is not nil or empty) for further
|
||||
// lookups.
|
||||
func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
|
||||
// Determine the property of the next character.
|
||||
nextProperty := property(sentenceBreakCodePoints, r)
|
||||
|
||||
// SB5 (Replacing Ignore Rules).
|
||||
if nextProperty == prExtend || nextProperty == prFormat {
|
||||
if state == sbParaSep || state == sbCR {
|
||||
return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
|
||||
}
|
||||
if state < 0 {
|
||||
return sbAny, true // SB1.
|
||||
}
|
||||
return state, false
|
||||
}
|
||||
|
||||
// Find the applicable transition in the table.
|
||||
var rule int
|
||||
newState, sentenceBreak, rule = sbTransitions(state, nextProperty)
|
||||
if newState < 0 {
|
||||
// No specific transition found. Try the less specific ones.
|
||||
anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny)
|
||||
anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty)
|
||||
if anyPropState >= 0 && anyStateState >= 0 {
|
||||
// Both apply. We'll use a mix (see comments for grTransitions).
|
||||
newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
|
||||
if anyPropRule < anyStateRule {
|
||||
sentenceBreak, rule = anyPropProp, anyPropRule
|
||||
}
|
||||
} else if anyPropState >= 0 {
|
||||
// We only have a specific state.
|
||||
newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule
|
||||
// This branch will probably never be reached because okAnyState will
|
||||
// always be true given the current transition map. But we keep it here
|
||||
// for future modifications to the transition map where this may not be
|
||||
// true anymore.
|
||||
} else if anyStateState >= 0 {
|
||||
// We only have a specific property.
|
||||
newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
|
||||
} else {
|
||||
// No known transition. SB999: Any × Any.
|
||||
newState, sentenceBreak, rule = sbAny, false, 9990
|
||||
}
|
||||
}
|
||||
|
||||
// SB8.
|
||||
if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
|
||||
// Check the right side of the rule.
|
||||
var length int
|
||||
for nextProperty != prOLetter &&
|
||||
nextProperty != prUpper &&
|
||||
nextProperty != prLower &&
|
||||
nextProperty != prSep &&
|
||||
nextProperty != prCR &&
|
||||
nextProperty != prLF &&
|
||||
nextProperty != prATerm &&
|
||||
nextProperty != prSTerm {
|
||||
// Move on to the next rune.
|
||||
if b != nil { // Byte slice version.
|
||||
r, length = utf8.DecodeRune(b)
|
||||
b = b[length:]
|
||||
} else { // String version.
|
||||
r, length = utf8.DecodeRuneInString(str)
|
||||
str = str[length:]
|
||||
}
|
||||
if r == utf8.RuneError {
|
||||
break
|
||||
}
|
||||
nextProperty = property(sentenceBreakCodePoints, r)
|
||||
}
|
||||
if nextProperty == prLower {
|
||||
return sbLower, false
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
242
vendor/github.com/rivo/uniseg/step.go
generated
vendored
Normal file
242
vendor/github.com/rivo/uniseg/step.go
generated
vendored
Normal file
@ -0,0 +1,242 @@
|
||||
package uniseg
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// The bit masks used to extract boundary information returned by [Step].
const (
	MaskLine     = 3
	MaskWord     = 1 << shiftWord
	MaskSentence = 1 << shiftSentence
)

// The number of bits to shift the boundary information returned by [Step] to
// obtain the monospace width of the grapheme cluster.
const ShiftWidth = 4

// The bit positions by which boundary flags are shifted by the [Step] function.
// The Mask constants above are derived from these. The width is shifted by
// ShiftWidth; it needs no mask because it always occupies the remaining bits.
const (
	shiftWord     = 2
	shiftSentence = 3
)

// The bit positions by which states are shifted by the [Step] function. These
// values must ensure state values defined for each of the boundary algorithms
// don't overlap (and that they all still fit in a single int). The grapheme
// state is not shifted; the property is shifted by shiftPropState and needs no
// mask because it always occupies the remaining bits.
const (
	shiftWordState     = 4
	shiftSentenceState = 9
	shiftLineState     = 13
	shiftPropState     = 21
)

// The bit masks used to extract the states returned by the [Step] function,
// after shifting. Each mask covers exactly the bits between the state's own
// shift position and the shift position of the next field.
const (
	maskGraphemeState = (1 << shiftWordState) - 1
	maskWordState     = (1 << (shiftSentenceState - shiftWordState)) - 1
	maskSentenceState = (1 << (shiftLineState - shiftSentenceState)) - 1
	maskLineState     = (1 << (shiftPropState - shiftLineState)) - 1
)
|
||||
|
||||
// Step returns the first grapheme cluster (user-perceived character) found in
|
||||
// the given byte slice. It also returns information about the boundary between
|
||||
// that grapheme cluster and the one following it as well as the monospace width
|
||||
// of the grapheme cluster. There are three types of boundary information: word
|
||||
// boundaries, sentence boundaries, and line breaks. This function is therefore
|
||||
// a combination of [FirstGraphemeCluster], [FirstWord], [FirstSentence], and
|
||||
// [FirstLineSegment].
|
||||
//
|
||||
// The "boundaries" return value can be evaluated as follows:
|
||||
//
|
||||
// - boundaries&MaskWord != 0: The boundary is a word boundary.
|
||||
// - boundaries&MaskWord == 0: The boundary is not a word boundary.
|
||||
// - boundaries&MaskSentence != 0: The boundary is a sentence boundary.
|
||||
// - boundaries&MaskSentence == 0: The boundary is not a sentence boundary.
|
||||
// - boundaries&MaskLine == LineDontBreak: You must not break the line at the
|
||||
// boundary.
|
||||
// - boundaries&MaskLine == LineMustBreak: You must break the line at the
|
||||
// boundary.
|
||||
// - boundaries&MaskLine == LineCanBreak: You may or may not break the line at
|
||||
// the boundary.
|
||||
// - boundaries >> ShiftWidth: The width of the grapheme cluster for most
|
||||
// monospace fonts where a value of 1 represents one character cell.
|
||||
//
|
||||
// This function can be called continuously to extract all grapheme clusters
|
||||
// from a byte slice, as illustrated in the examples below.
|
||||
//
|
||||
// If you don't know which state to pass, for example when calling the function
|
||||
// for the first time, you must pass -1. For consecutive calls, pass the state
|
||||
// and rest slice returned by the previous call.
|
||||
//
|
||||
// The "rest" slice is the sub-slice of the original byte slice "b" starting
|
||||
// after the last byte of the identified grapheme cluster. If the length of the
|
||||
// "rest" slice is 0, the entire byte slice "b" has been processed. The
|
||||
// "cluster" byte slice is the sub-slice of the input slice containing the
|
||||
// first identified grapheme cluster.
|
||||
//
|
||||
// Given an empty byte slice "b", the function returns nil values.
|
||||
//
|
||||
// While slightly less convenient than using the Graphemes class, this function
|
||||
// has much better performance and makes no allocations. It lends itself well to
|
||||
// large byte slices.
|
||||
//
|
||||
// Note that in accordance with [UAX #14 LB3], the final segment will end with
|
||||
// a mandatory line break (boundaries&MaskLine == LineMustBreak). You can choose
|
||||
// to ignore this by checking if the length of the "rest" slice is 0 and calling
|
||||
// [HasTrailingLineBreak] or [HasTrailingLineBreakInString] on the last rune.
|
||||
//
|
||||
// [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm
|
||||
func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState int) {
|
||||
// An empty byte slice returns nothing.
|
||||
if len(b) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Extract the first rune.
|
||||
r, length := utf8.DecodeRune(b)
|
||||
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
|
||||
var prop int
|
||||
if state < 0 {
|
||||
prop = propertyGraphemes(r)
|
||||
} else {
|
||||
prop = state >> shiftPropState
|
||||
}
|
||||
return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
|
||||
}
|
||||
|
||||
// If we don't know the state, determine it now.
|
||||
var graphemeState, wordState, sentenceState, lineState, firstProp int
|
||||
remainder := b[length:]
|
||||
if state < 0 {
|
||||
graphemeState, firstProp, _ = transitionGraphemeState(state, r)
|
||||
wordState, _ = transitionWordBreakState(state, r, remainder, "")
|
||||
sentenceState, _ = transitionSentenceBreakState(state, r, remainder, "")
|
||||
lineState, _ = transitionLineBreakState(state, r, remainder, "")
|
||||
} else {
|
||||
graphemeState = state & maskGraphemeState
|
||||
wordState = (state >> shiftWordState) & maskWordState
|
||||
sentenceState = (state >> shiftSentenceState) & maskSentenceState
|
||||
lineState = (state >> shiftLineState) & maskLineState
|
||||
firstProp = state >> shiftPropState
|
||||
}
|
||||
|
||||
// Transition until we find a grapheme cluster boundary.
|
||||
width := runeWidth(r, firstProp)
|
||||
for {
|
||||
var (
|
||||
graphemeBoundary, wordBoundary, sentenceBoundary bool
|
||||
lineBreak, prop int
|
||||
)
|
||||
|
||||
r, l := utf8.DecodeRune(remainder)
|
||||
remainder = b[length+l:]
|
||||
|
||||
graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
|
||||
wordState, wordBoundary = transitionWordBreakState(wordState, r, remainder, "")
|
||||
sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, remainder, "")
|
||||
lineState, lineBreak = transitionLineBreakState(lineState, r, remainder, "")
|
||||
|
||||
if graphemeBoundary {
|
||||
boundary := lineBreak | (width << ShiftWidth)
|
||||
if wordBoundary {
|
||||
boundary |= 1 << shiftWord
|
||||
}
|
||||
if sentenceBoundary {
|
||||
boundary |= 1 << shiftSentence
|
||||
}
|
||||
return b[:length], b[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
|
||||
}
|
||||
|
||||
if firstProp == prExtendedPictographic {
|
||||
if r == vs15 {
|
||||
width = 1
|
||||
} else if r == vs16 {
|
||||
width = 2
|
||||
}
|
||||
} else if firstProp != prRegionalIndicator && firstProp != prL {
|
||||
width += runeWidth(r, prop)
|
||||
}
|
||||
|
||||
length += l
|
||||
if len(b) <= length {
|
||||
return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// StepString is like [Step] but its input and outputs are strings.
|
||||
func StepString(str string, state int) (cluster, rest string, boundaries int, newState int) {
|
||||
// An empty byte slice returns nothing.
|
||||
if len(str) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Extract the first rune.
|
||||
r, length := utf8.DecodeRuneInString(str)
|
||||
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
|
||||
prop := propertyGraphemes(r)
|
||||
return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
|
||||
}
|
||||
|
||||
// If we don't know the state, determine it now.
|
||||
var graphemeState, wordState, sentenceState, lineState, firstProp int
|
||||
remainder := str[length:]
|
||||
if state < 0 {
|
||||
graphemeState, firstProp, _ = transitionGraphemeState(state, r)
|
||||
wordState, _ = transitionWordBreakState(state, r, nil, remainder)
|
||||
sentenceState, _ = transitionSentenceBreakState(state, r, nil, remainder)
|
||||
lineState, _ = transitionLineBreakState(state, r, nil, remainder)
|
||||
} else {
|
||||
graphemeState = state & maskGraphemeState
|
||||
wordState = (state >> shiftWordState) & maskWordState
|
||||
sentenceState = (state >> shiftSentenceState) & maskSentenceState
|
||||
lineState = (state >> shiftLineState) & maskLineState
|
||||
firstProp = state >> shiftPropState
|
||||
}
|
||||
|
||||
// Transition until we find a grapheme cluster boundary.
|
||||
width := runeWidth(r, firstProp)
|
||||
for {
|
||||
var (
|
||||
graphemeBoundary, wordBoundary, sentenceBoundary bool
|
||||
lineBreak, prop int
|
||||
)
|
||||
|
||||
r, l := utf8.DecodeRuneInString(remainder)
|
||||
remainder = str[length+l:]
|
||||
|
||||
graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
|
||||
wordState, wordBoundary = transitionWordBreakState(wordState, r, nil, remainder)
|
||||
sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, nil, remainder)
|
||||
lineState, lineBreak = transitionLineBreakState(lineState, r, nil, remainder)
|
||||
|
||||
if graphemeBoundary {
|
||||
boundary := lineBreak | (width << ShiftWidth)
|
||||
if wordBoundary {
|
||||
boundary |= 1 << shiftWord
|
||||
}
|
||||
if sentenceBoundary {
|
||||
boundary |= 1 << shiftSentence
|
||||
}
|
||||
return str[:length], str[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
|
||||
}
|
||||
|
||||
if firstProp == prExtendedPictographic {
|
||||
if r == vs15 {
|
||||
width = 1
|
||||
} else if r == vs16 {
|
||||
width = 2
|
||||
}
|
||||
} else if firstProp != prRegionalIndicator && firstProp != prL {
|
||||
width += runeWidth(r, prop)
|
||||
}
|
||||
|
||||
length += l
|
||||
if len(str) <= length {
|
||||
return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
|
||||
}
|
||||
}
|
||||
}
|
61
vendor/github.com/rivo/uniseg/width.go
generated
vendored
Normal file
61
vendor/github.com/rivo/uniseg/width.go
generated
vendored
Normal file
@ -0,0 +1,61 @@
|
||||
package uniseg
|
||||
|
||||
// EastAsianAmbiguousWidth specifies the monospace width for East Asian
|
||||
// characters classified as Ambiguous. The default is 1 but some rare fonts
|
||||
// render them with a width of 2.
|
||||
var EastAsianAmbiguousWidth = 1
|
||||
|
||||
// runeWidth returns the monospace width for the given rune. The provided
|
||||
// grapheme property is a value mapped by the [graphemeCodePoints] table.
|
||||
//
|
||||
// Every rune has a width of 1, except for runes with the following properties
|
||||
// (evaluated in this order):
|
||||
//
|
||||
// - Control, CR, LF, Extend, ZWJ: Width of 0
|
||||
// - \u2e3a, TWO-EM DASH: Width of 3
|
||||
// - \u2e3b, THREE-EM DASH: Width of 4
|
||||
// - East-Asian width Fullwidth and Wide: Width of 2 (Ambiguous and Neutral
|
||||
// have a width of 1)
|
||||
// - Regional Indicator: Width of 2
|
||||
// - Extended Pictographic: Width of 2, unless Emoji Presentation is "No".
|
||||
func runeWidth(r rune, graphemeProperty int) int {
|
||||
switch graphemeProperty {
|
||||
case prControl, prCR, prLF, prExtend, prZWJ:
|
||||
return 0
|
||||
case prRegionalIndicator:
|
||||
return 2
|
||||
case prExtendedPictographic:
|
||||
if property(emojiPresentation, r) == prEmojiPresentation {
|
||||
return 2
|
||||
}
|
||||
return 1
|
||||
}
|
||||
|
||||
switch r {
|
||||
case 0x2e3a:
|
||||
return 3
|
||||
case 0x2e3b:
|
||||
return 4
|
||||
}
|
||||
|
||||
switch propertyEastAsianWidth(r) {
|
||||
case prW, prF:
|
||||
return 2
|
||||
case prA:
|
||||
return EastAsianAmbiguousWidth
|
||||
}
|
||||
|
||||
return 1
|
||||
}
|
||||
|
||||
// StringWidth returns the monospace width for the given string, that is, the
|
||||
// number of same-size cells to be occupied by the string.
|
||||
func StringWidth(s string) (width int) {
|
||||
state := -1
|
||||
for len(s) > 0 {
|
||||
var w int
|
||||
_, s, w, state = FirstGraphemeClusterInString(s, state)
|
||||
width += w
|
||||
}
|
||||
return
|
||||
}
|
89
vendor/github.com/rivo/uniseg/word.go
generated
vendored
Normal file
89
vendor/github.com/rivo/uniseg/word.go
generated
vendored
Normal file
@ -0,0 +1,89 @@
|
||||
package uniseg
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// FirstWord returns the first word found in the given byte slice according to
|
||||
// the rules of [Unicode Standard Annex #29, Word Boundaries]. This function can
|
||||
// be called continuously to extract all words from a byte slice, as illustrated
|
||||
// in the example below.
|
||||
//
|
||||
// If you don't know the current state, for example when calling the function
|
||||
// for the first time, you must pass -1. For consecutive calls, pass the state
|
||||
// and rest slice returned by the previous call.
|
||||
//
|
||||
// The "rest" slice is the sub-slice of the original byte slice "b" starting
|
||||
// after the last byte of the identified word. If the length of the "rest" slice
|
||||
// is 0, the entire byte slice "b" has been processed. The "word" byte slice is
|
||||
// the sub-slice of the input slice containing the identified word.
|
||||
//
|
||||
// Given an empty byte slice "b", the function returns nil values.
|
||||
//
|
||||
// [Unicode Standard Annex #29, Word Boundaries]: http://unicode.org/reports/tr29/#Word_Boundaries
|
||||
func FirstWord(b []byte, state int) (word, rest []byte, newState int) {
|
||||
// An empty byte slice returns nothing.
|
||||
if len(b) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Extract the first rune.
|
||||
r, length := utf8.DecodeRune(b)
|
||||
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
|
||||
return b, nil, wbAny
|
||||
}
|
||||
|
||||
// If we don't know the state, determine it now.
|
||||
if state < 0 {
|
||||
state, _ = transitionWordBreakState(state, r, b[length:], "")
|
||||
}
|
||||
|
||||
// Transition until we find a boundary.
|
||||
var boundary bool
|
||||
for {
|
||||
r, l := utf8.DecodeRune(b[length:])
|
||||
state, boundary = transitionWordBreakState(state, r, b[length+l:], "")
|
||||
|
||||
if boundary {
|
||||
return b[:length], b[length:], state
|
||||
}
|
||||
|
||||
length += l
|
||||
if len(b) <= length {
|
||||
return b, nil, wbAny
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FirstWordInString is like [FirstWord] but its input and outputs are strings.
|
||||
func FirstWordInString(str string, state int) (word, rest string, newState int) {
|
||||
// An empty byte slice returns nothing.
|
||||
if len(str) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Extract the first rune.
|
||||
r, length := utf8.DecodeRuneInString(str)
|
||||
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
|
||||
return str, "", wbAny
|
||||
}
|
||||
|
||||
// If we don't know the state, determine it now.
|
||||
if state < 0 {
|
||||
state, _ = transitionWordBreakState(state, r, nil, str[length:])
|
||||
}
|
||||
|
||||
// Transition until we find a boundary.
|
||||
var boundary bool
|
||||
for {
|
||||
r, l := utf8.DecodeRuneInString(str[length:])
|
||||
state, boundary = transitionWordBreakState(state, r, nil, str[length+l:])
|
||||
|
||||
if boundary {
|
||||
return str[:length], str[length:], state
|
||||
}
|
||||
|
||||
length += l
|
||||
if len(str) <= length {
|
||||
return str, "", wbAny
|
||||
}
|
||||
}
|
||||
}
|
1883
vendor/github.com/rivo/uniseg/wordproperties.go
generated
vendored
Normal file
1883
vendor/github.com/rivo/uniseg/wordproperties.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
282
vendor/github.com/rivo/uniseg/wordrules.go
generated
vendored
Normal file
282
vendor/github.com/rivo/uniseg/wordrules.go
generated
vendored
Normal file
@ -0,0 +1,282 @@
|
||||
package uniseg
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// The states of the word break parser. The states proper are numbered
// consecutively from 0. wbZWJBit is not a state of its own but a flag which is
// combined with a state.
const (
	wbAny = iota
	wbCR
	wbLF
	wbNewline
	wbWSegSpace
	wbHebrewLetter
	wbALetter
	wbWB7
	wbWB7c
	wbNumeric
	wbWB11
	wbKatakana
	wbExtendNumLet
	wbOddRI
	wbEvenRI

	// wbZWJBit is set for any states followed by at least one zero-width
	// joiner (see WB4 and WB3c).
	wbZWJBit = 16
)
|
||||
|
||||
// wbTransitions implements the word break parser's state transitions. It's
|
||||
// anologous to [grTransitions], see comments there for details.
|
||||
//
|
||||
// Unicode version 15.0.0.
|
||||
func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) {
|
||||
switch uint64(state) | uint64(prop)<<32 {
|
||||
// WB3b.
|
||||
case wbAny | prNewline<<32:
|
||||
return wbNewline, true, 32
|
||||
case wbAny | prCR<<32:
|
||||
return wbCR, true, 32
|
||||
case wbAny | prLF<<32:
|
||||
return wbLF, true, 32
|
||||
|
||||
// WB3a.
|
||||
case wbNewline | prAny<<32:
|
||||
return wbAny, true, 31
|
||||
case wbCR | prAny<<32:
|
||||
return wbAny, true, 31
|
||||
case wbLF | prAny<<32:
|
||||
return wbAny, true, 31
|
||||
|
||||
// WB3.
|
||||
case wbCR | prLF<<32:
|
||||
return wbLF, false, 30
|
||||
|
||||
// WB3d.
|
||||
case wbAny | prWSegSpace<<32:
|
||||
return wbWSegSpace, true, 9990
|
||||
case wbWSegSpace | prWSegSpace<<32:
|
||||
return wbWSegSpace, false, 34
|
||||
|
||||
// WB5.
|
||||
case wbAny | prALetter<<32:
|
||||
return wbALetter, true, 9990
|
||||
case wbAny | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, true, 9990
|
||||
case wbALetter | prALetter<<32:
|
||||
return wbALetter, false, 50
|
||||
case wbALetter | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, false, 50
|
||||
case wbHebrewLetter | prALetter<<32:
|
||||
return wbALetter, false, 50
|
||||
case wbHebrewLetter | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, false, 50
|
||||
|
||||
// WB7. Transitions to wbWB7 handled by transitionWordBreakState().
|
||||
case wbWB7 | prALetter<<32:
|
||||
return wbALetter, false, 70
|
||||
case wbWB7 | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, false, 70
|
||||
|
||||
// WB7a.
|
||||
case wbHebrewLetter | prSingleQuote<<32:
|
||||
return wbAny, false, 71
|
||||
|
||||
// WB7c. Transitions to wbWB7c handled by transitionWordBreakState().
|
||||
case wbWB7c | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, false, 73
|
||||
|
||||
// WB8.
|
||||
case wbAny | prNumeric<<32:
|
||||
return wbNumeric, true, 9990
|
||||
case wbNumeric | prNumeric<<32:
|
||||
return wbNumeric, false, 80
|
||||
|
||||
// WB9.
|
||||
case wbALetter | prNumeric<<32:
|
||||
return wbNumeric, false, 90
|
||||
case wbHebrewLetter | prNumeric<<32:
|
||||
return wbNumeric, false, 90
|
||||
|
||||
// WB10.
|
||||
case wbNumeric | prALetter<<32:
|
||||
return wbALetter, false, 100
|
||||
case wbNumeric | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, false, 100
|
||||
|
||||
// WB11. Transitions to wbWB11 handled by transitionWordBreakState().
|
||||
case wbWB11 | prNumeric<<32:
|
||||
return wbNumeric, false, 110
|
||||
|
||||
// WB13.
|
||||
case wbAny | prKatakana<<32:
|
||||
return wbKatakana, true, 9990
|
||||
case wbKatakana | prKatakana<<32:
|
||||
return wbKatakana, false, 130
|
||||
|
||||
// WB13a.
|
||||
case wbAny | prExtendNumLet<<32:
|
||||
return wbExtendNumLet, true, 9990
|
||||
case wbALetter | prExtendNumLet<<32:
|
||||
return wbExtendNumLet, false, 131
|
||||
case wbHebrewLetter | prExtendNumLet<<32:
|
||||
return wbExtendNumLet, false, 131
|
||||
case wbNumeric | prExtendNumLet<<32:
|
||||
return wbExtendNumLet, false, 131
|
||||
case wbKatakana | prExtendNumLet<<32:
|
||||
return wbExtendNumLet, false, 131
|
||||
case wbExtendNumLet | prExtendNumLet<<32:
|
||||
return wbExtendNumLet, false, 131
|
||||
|
||||
// WB13b.
|
||||
case wbExtendNumLet | prALetter<<32:
|
||||
return wbALetter, false, 132
|
||||
case wbExtendNumLet | prHebrewLetter<<32:
|
||||
return wbHebrewLetter, false, 132
|
||||
case wbExtendNumLet | prNumeric<<32:
|
||||
return wbNumeric, false, 132
|
||||
case wbExtendNumLet | prKatakana<<32:
|
||||
return wbKatakana, false, 132
|
||||
|
||||
default:
|
||||
return -1, false, -1
|
||||
}
|
||||
}
|
||||
|
||||
// transitionWordBreakState determines the new state of the word break parser
|
||||
// given the current state and the next code point. It also returns whether a
|
||||
// word boundary was detected. If more than one code point is needed to
|
||||
// determine the new state, the byte slice or the string starting after rune "r"
|
||||
// can be used (whichever is not nil or empty) for further lookups.
|
||||
func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
|
||||
// Determine the property of the next character.
|
||||
nextProperty := property(workBreakCodePoints, r)
|
||||
|
||||
// "Replacing Ignore Rules".
|
||||
if nextProperty == prZWJ {
|
||||
// WB4 (for zero-width joiners).
|
||||
if state == wbNewline || state == wbCR || state == wbLF {
|
||||
return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a.
|
||||
}
|
||||
if state < 0 {
|
||||
return wbAny | wbZWJBit, false
|
||||
}
|
||||
return state | wbZWJBit, false
|
||||
} else if nextProperty == prExtend || nextProperty == prFormat {
|
||||
// WB4 (for Extend and Format).
|
||||
if state == wbNewline || state == wbCR || state == wbLF {
|
||||
return wbAny, true // Make sure we don't apply WB4 to WB3a.
|
||||
}
|
||||
if state == wbWSegSpace || state == wbAny|wbZWJBit {
|
||||
return wbAny, false // We don't break but this is also not WB3d or WB3c.
|
||||
}
|
||||
if state < 0 {
|
||||
return wbAny, false
|
||||
}
|
||||
return state, false
|
||||
} else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 {
|
||||
// WB3c.
|
||||
return wbAny, false
|
||||
}
|
||||
if state >= 0 {
|
||||
state = state &^ wbZWJBit
|
||||
}
|
||||
|
||||
// Find the applicable transition in the table.
|
||||
var rule int
|
||||
newState, wordBreak, rule = wbTransitions(state, nextProperty)
|
||||
if newState < 0 {
|
||||
// No specific transition found. Try the less specific ones.
|
||||
anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny)
|
||||
anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty)
|
||||
if anyPropState >= 0 && anyStateState >= 0 {
|
||||
// Both apply. We'll use a mix (see comments for grTransitions).
|
||||
newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
|
||||
if anyPropRule < anyStateRule {
|
||||
wordBreak, rule = anyPropWordBreak, anyPropRule
|
||||
}
|
||||
} else if anyPropState >= 0 {
|
||||
// We only have a specific state.
|
||||
newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule
|
||||
// This branch will probably never be reached because anyStateState >= 0 will
|
||||
// always be true given the current transition map. But we keep it here
|
||||
// for future modifications to the transition map where this may not be
|
||||
// true anymore.
|
||||
} else if anyStateState >= 0 {
|
||||
// We only have a specific property.
|
||||
newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
|
||||
} else {
|
||||
// No known transition. WB999: Any ÷ Any.
|
||||
newState, wordBreak, rule = wbAny, true, 9990
|
||||
}
|
||||
}
|
||||
|
||||
// For those rules that need to look up runes further in the string, we
|
||||
// determine the property after nextProperty, skipping over Format, Extend,
|
||||
// and ZWJ (according to WB4). It's -1 if not needed, if such a rune cannot
|
||||
// be determined (because the text ends or the rune is faulty).
|
||||
farProperty := -1
|
||||
if rule > 60 &&
|
||||
(state == wbALetter || state == wbHebrewLetter || state == wbNumeric) &&
|
||||
(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6.
|
||||
nextProperty == prDoubleQuote || // WB7b.
|
||||
nextProperty == prMidNum) { // WB12.
|
||||
for {
|
||||
var (
|
||||
r rune
|
||||
length int
|
||||
)
|
||||
if b != nil { // Byte slice version.
|
||||
r, length = utf8.DecodeRune(b)
|
||||
b = b[length:]
|
||||
} else { // String version.
|
||||
r, length = utf8.DecodeRuneInString(str)
|
||||
str = str[length:]
|
||||
}
|
||||
if r == utf8.RuneError {
|
||||
break
|
||||
}
|
||||
prop := property(workBreakCodePoints, r)
|
||||
if prop == prExtend || prop == prFormat || prop == prZWJ {
|
||||
continue
|
||||
}
|
||||
farProperty = prop
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// WB6.
|
||||
if rule > 60 &&
|
||||
(state == wbALetter || state == wbHebrewLetter) &&
|
||||
(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
|
||||
(farProperty == prALetter || farProperty == prHebrewLetter) {
|
||||
return wbWB7, false
|
||||
}
|
||||
|
||||
// WB7b.
|
||||
if rule > 72 &&
|
||||
state == wbHebrewLetter &&
|
||||
nextProperty == prDoubleQuote &&
|
||||
farProperty == prHebrewLetter {
|
||||
return wbWB7c, false
|
||||
}
|
||||
|
||||
// WB12.
|
||||
if rule > 120 &&
|
||||
state == wbNumeric &&
|
||||
(nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
|
||||
farProperty == prNumeric {
|
||||
return wbWB11, false
|
||||
}
|
||||
|
||||
// WB15 and WB16.
|
||||
if newState == wbAny && nextProperty == prRegionalIndicator {
|
||||
if state != wbOddRI && state != wbEvenRI { // Includes state == -1.
|
||||
// Transition into the first RI.
|
||||
return wbOddRI, true
|
||||
}
|
||||
if state == wbOddRI {
|
||||
// Don't break pairs of Regional Indicators.
|
||||
return wbEvenRI, false
|
||||
}
|
||||
return wbOddRI, true // We can break after a pair.
|
||||
}
|
||||
|
||||
return
|
||||
}
|
3
vendor/modules.txt
vendored
3
vendor/modules.txt
vendored
@ -597,6 +597,9 @@ github.com/quasoft/memstore
|
||||
# github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec
|
||||
## explicit; go 1.12
|
||||
github.com/remyoudompheng/bigfft
|
||||
# github.com/rivo/uniseg v0.4.7
|
||||
## explicit; go 1.18
|
||||
github.com/rivo/uniseg
|
||||
# github.com/rogpeppe/go-internal v1.13.2-0.20241226121412-a5dc8ff20d0a
|
||||
## explicit; go 1.22.0
|
||||
github.com/rogpeppe/go-internal/fmtsort
|
||||
|
Loading…
Reference in New Issue
Block a user