Improve titles, trim body to reasonable length

This commit is contained in:
Vyr Cossont 2025-01-19 16:19:50 -08:00
parent 13eda35985
commit 93aeadbd9f
30 changed files with 16492 additions and 9 deletions

1
go.mod
View File

@ -66,6 +66,7 @@ require (
github.com/ncruces/go-sqlite3 v0.22.0
github.com/oklog/ulid v1.3.1
github.com/prometheus/client_golang v1.20.5
github.com/rivo/uniseg v0.4.7
github.com/spf13/cobra v1.8.1
github.com/spf13/viper v1.19.0
github.com/stretchr/testify v1.10.0

2
go.sum generated
View File

@ -476,6 +476,8 @@ github.com/quasoft/memstore v0.0.0-20191010062613-2bce066d2b0b h1:aUNXCGgukb4gtY
github.com/quasoft/memstore v0.0.0-20191010062613-2bce066d2b0b/go.mod h1:wTPjTepVu7uJBYgZ0SdWHQlIas582j6cn2jgk4DDdlg=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/rogpeppe/go-internal v1.13.2-0.20241226121412-a5dc8ff20d0a h1:w3tdWGKbLGBPtR/8/oO74W6hmz0qE5q0z9aqSAewaaM=

View File

@ -56,7 +56,7 @@ type Notification struct {
NotificationPendingReply NotificationType = 10 // NotificationPendingReply -- Someone has replied to a status of yours, which requires approval by you.
NotificationPendingReblog NotificationType = 11 // NotificationPendingReblog -- Someone has boosted a status of yours, which requires approval by you.
NotificationAdminReport NotificationType = 12 // NotificationAdminReport -- someone has submitted a new report to the instance.
NotificationUpdate NotificationType = 13
NotificationUpdate NotificationType = 13 // NotificationUpdate -- someone has edited their status.
NotificationTypeNumValues NotificationType = 14 // NotificationTypeNumValues -- 1 + number of max notification type
)

View File

@ -0,0 +1,45 @@
// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package text
import (
"github.com/rivo/uniseg"
)
// FirstNBytesByWords produces a prefix substring of up to n bytes from a given string, respecting Unicode grapheme and
// word boundaries. The substring may be empty, and may include leading or trailing whitespace.
func FirstNBytesByWords(s string, n int) string {
substringEnd := 0
graphemes := uniseg.NewGraphemes(s)
for graphemes.Next() {
if !graphemes.IsWordBoundary() {
continue
}
_, end := graphemes.Positions()
if end > n {
break
}
substringEnd = end
}
return s[0:substringEnd]
}

View File

@ -0,0 +1,47 @@
// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package text_test
import (
"testing"
"github.com/stretchr/testify/suite"
"github.com/superseriousbusiness/gotosocial/internal/text"
)
type SubstringTestSuite struct {
suite.Suite
}
func (suite *SubstringTestSuite) TestText() {
suite.Equal(
"Sphinx of black quartz, ",
text.FirstNBytesByWords("Sphinx of black quartz, judge my vow!", 25),
)
}
func (suite *SubstringTestSuite) TestEmoji() {
suite.Equal(
"🏳️‍⚧️ ",
text.FirstNBytesByWords("🏳️‍⚧️ 🙈", 20),
)
}
func TestSubstringTestSuite(t *testing.T) {
suite.Run(t, new(SubstringTestSuite))
}

View File

@ -23,6 +23,7 @@
"fmt"
"io"
"net/http"
"strings"
"time"
webpushgo "github.com/SherClockHolmes/webpush-go"
@ -59,9 +60,13 @@ func NewRealSender(httpClient *http.Client, state *state.State) Sender {
// while waiting for the client to retrieve them.
const TTL = 48 * time.Hour
// responseBodyMaxLen limits how much of the Web Push server response we use for error messages.
// responseBodyMaxLen limits how much of the Web Push server response we read for error messages.
const responseBodyMaxLen = 1024
// bodyMaxLen is a polite maximum length for a Web Push notification's body text, in bytes.
// Note that this isn't limited per se, but Web Push servers may reject anything with a total request body size over 4k.
const bodyMaxLen = 3000
func (r *realSender) Send(
ctx context.Context,
notification *gtsmodel.Notification,
@ -126,7 +131,7 @@ func (r *realSender) Send(
vapidKeyPair,
vapidSubjectEmail,
subscription,
notification.TargetAccount,
notification,
apiNotification,
); err != nil {
log.Errorf(
@ -148,7 +153,7 @@ func (r *realSender) sendToSubscription(
vapidKeyPair *gtsmodel.VAPIDKeyPair,
vapidSubjectEmail string,
subscription *gtsmodel.WebPushSubscription,
targetAccount *gtsmodel.Account,
notification *gtsmodel.Notification,
apiNotification *apimodel.Notification,
) error {
// Get the associated access token.
@ -162,7 +167,7 @@ func (r *realSender) sendToSubscription(
NotificationID: apiNotification.ID,
NotificationType: apiNotification.Type,
Icon: apiNotification.Account.Avatar,
PreferredLocale: targetAccount.Settings.Language,
PreferredLocale: notification.TargetAccount.Settings.Language,
AccessToken: token.Access,
}
@ -171,8 +176,45 @@ func (r *realSender) sendToSubscription(
if displayNameOrAcct == "" {
displayNameOrAcct = apiNotification.Account.Acct
}
// TODO: (Vyr) improve copy
pushNotification.Title = fmt.Sprintf("%s from %s", apiNotification.Type, displayNameOrAcct)
switch notification.NotificationType {
case gtsmodel.NotificationFollow:
pushNotification.Title = fmt.Sprintf("%s followed you", displayNameOrAcct)
case gtsmodel.NotificationFollowRequest:
pushNotification.Title = fmt.Sprintf("%s requested to follow you", displayNameOrAcct)
case gtsmodel.NotificationMention:
pushNotification.Title = fmt.Sprintf("%s mentioned you", displayNameOrAcct)
case gtsmodel.NotificationReblog:
pushNotification.Title = fmt.Sprintf("%s boosted your post", displayNameOrAcct)
case gtsmodel.NotificationFavourite:
pushNotification.Title = fmt.Sprintf("%s faved your post", displayNameOrAcct)
case gtsmodel.NotificationPoll:
if subscription.AccountID == notification.TargetAccountID {
pushNotification.Title = fmt.Sprintf("Your poll has ended")
} else {
pushNotification.Title = fmt.Sprintf("%s's poll has ended", displayNameOrAcct)
}
case gtsmodel.NotificationStatus:
pushNotification.Title = fmt.Sprintf("%s posted", displayNameOrAcct)
case gtsmodel.NotificationAdminSignup:
pushNotification.Title = fmt.Sprintf("%s requested to sign up", displayNameOrAcct)
case gtsmodel.NotificationPendingFave:
pushNotification.Title = fmt.Sprintf("%s faved your post, which requires your approval", displayNameOrAcct)
case gtsmodel.NotificationPendingReply:
pushNotification.Title = fmt.Sprintf("%s mentioned you, which requires your approval", displayNameOrAcct)
case gtsmodel.NotificationPendingReblog:
pushNotification.Title = fmt.Sprintf("%s boosted your post, which requires your approval", displayNameOrAcct)
case gtsmodel.NotificationAdminReport:
pushNotification.Title = fmt.Sprintf("%s submitted a report", displayNameOrAcct)
case gtsmodel.NotificationUpdate:
pushNotification.Title = fmt.Sprintf("%s updated their post", displayNameOrAcct)
default:
log.Warnf(ctx, "Unknown notification type: %d", notification.NotificationType)
pushNotification.Title = fmt.Sprintf(
"%s did something (unknown notification type %d)",
displayNameOrAcct,
notification.NotificationType,
)
}
// Set the notification body.
if apiNotification.Status != nil {
@ -184,7 +226,7 @@ func (r *realSender) sendToSubscription(
} else {
pushNotification.Body = text.SanitizeToPlaintext(apiNotification.Account.Note)
}
// TODO: (Vyr) trim this
pushNotification.Body = firstNBytesTrimSpace(pushNotification.Body, bodyMaxLen)
// Encode the push notification as JSON.
pushNotificationBytes, err := json.Marshal(pushNotification)
@ -221,6 +263,7 @@ func (r *realSender) sendToSubscription(
if resp.StatusCode < 200 || resp.StatusCode > 299 {
if resp.StatusCode >= 400 && resp.StatusCode <= 499 &&
resp.StatusCode != http.StatusRequestTimeout &&
resp.StatusCode != http.StatusRequestEntityTooLarge &&
resp.StatusCode != http.StatusTooManyRequests {
// We should not send any more notifications to this subscription. Try to delete it.
if err := r.state.DB.DeleteWebPushSubscriptionByTokenID(ctx, subscription.TokenID); err != nil {
@ -256,6 +299,11 @@ func (r *realSender) sendToSubscription(
return nil
}
// firstNBytesTrimSpace returns the first N bytes of a string, trimming leading and trailing whitespace.
func firstNBytesTrimSpace(s string, n int) string {
return strings.TrimSpace(text.FirstNBytesByWords(strings.TrimSpace(s), n))
}
// gtsHTTPClientRoundTripper helps wrap a GtS HTTP client back into a regular HTTP client,
// so that webpush-go can use our IP filters, bad hosts list, and retries.
type gtsHTTPClientRoundTripper struct {

View File

@ -168,7 +168,7 @@ type notifyingReadCloser struct {
bodyClosed chan struct{}
}
func (rc *notifyingReadCloser) Read(p []byte) (n int, err error) {
func (rc *notifyingReadCloser) Read(_ []byte) (n int, err error) {
return 0, io.EOF
}

21
vendor/github.com/rivo/uniseg/LICENSE.txt generated vendored Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 Oliver Kuederle
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

137
vendor/github.com/rivo/uniseg/README.md generated vendored Normal file
View File

@ -0,0 +1,137 @@
# Unicode Text Segmentation for Go
[![Go Reference](https://pkg.go.dev/badge/github.com/rivo/uniseg.svg)](https://pkg.go.dev/github.com/rivo/uniseg)
[![Go Report](https://img.shields.io/badge/go%20report-A%2B-brightgreen.svg)](https://goreportcard.com/report/github.com/rivo/uniseg)
This Go package implements Unicode Text Segmentation according to [Unicode Standard Annex #29](https://unicode.org/reports/tr29/), Unicode Line Breaking according to [Unicode Standard Annex #14](https://unicode.org/reports/tr14/) (Unicode version 15.0.0), and monospace font string width calculation similar to [wcwidth](https://man7.org/linux/man-pages/man3/wcwidth.3.html).
## Background
### Grapheme Clusters
In Go, [strings are read-only slices of bytes](https://go.dev/blog/strings). They can be turned into Unicode code points using the `for` loop or by casting: `[]rune(str)`. However, multiple code points may be combined into one user-perceived character or what the Unicode specification calls "grapheme cluster". Here are some examples:
|String|Bytes (UTF-8)|Code points (runes)|Grapheme clusters|
|-|-|-|-|
|Käse|6 bytes: `4b 61 cc 88 73 65`|5 code points: `4b 61 308 73 65`|4 clusters: `[4b],[61 308],[73],[65]`|
|🏳️‍🌈|14 bytes: `f0 9f 8f b3 ef b8 8f e2 80 8d f0 9f 8c 88`|4 code points: `1f3f3 fe0f 200d 1f308`|1 cluster: `[1f3f3 fe0f 200d 1f308]`|
|🇩🇪|8 bytes: `f0 9f 87 a9 f0 9f 87 aa`|2 code points: `1f1e9 1f1ea`|1 cluster: `[1f1e9 1f1ea]`|
This package provides tools to iterate over these grapheme clusters. This may be used to determine the number of user-perceived characters, to split strings in their intended places, or to extract individual characters which form a unit.
### Word Boundaries
Word boundaries are used in a number of different contexts. The most familiar ones are selection (double-click mouse selection), cursor movement ("move to next word" control-arrow keys), and the dialog option "Whole Word Search" for search and replace. They are also used in database queries, to determine whether elements are within a certain number of words of one another. Searching may also use word boundaries in determining matching items. This package provides tools to determine word boundaries within strings.
### Sentence Boundaries
Sentence boundaries are often used for triple-click or some other method of selecting or iterating through blocks of text that are larger than single words. They are also used to determine whether words occur within the same sentence in database queries. This package provides tools to determine sentence boundaries within strings.
### Line Breaking
Line breaking, also known as word wrapping, is the process of breaking a section of text into lines such that it will fit in the available width of a page, window or other display area. This package provides tools to determine where a string may or may not be broken and where it must be broken (for example after newline characters).
### Monospace Width
Most terminals or text displays / text editors using a monospace font (for example source code editors) use a fixed width for each character. Some characters such as emojis or characters found in Asian and other languages may take up more than one character cell. This package provides tools to determine the number of cells a string will take up when displayed in a monospace font. See [here](https://pkg.go.dev/github.com/rivo/uniseg#hdr-Monospace_Width) for more information.
## Installation
```bash
go get github.com/rivo/uniseg
```
## Examples
### Counting Characters in a String
```go
n := uniseg.GraphemeClusterCount("🇩🇪🏳️‍🌈")
fmt.Println(n)
// 2
```
### Calculating the Monospace String Width
```go
width := uniseg.StringWidth("🇩🇪🏳️‍🌈!")
fmt.Println(width)
// 5
```
### Using the [`Graphemes`](https://pkg.go.dev/github.com/rivo/uniseg#Graphemes) Class
This is the most convenient method of iterating over grapheme clusters:
```go
gr := uniseg.NewGraphemes("👍🏼!")
for gr.Next() {
fmt.Printf("%x ", gr.Runes())
}
// [1f44d 1f3fc] [21]
```
### Using the [`Step`](https://pkg.go.dev/github.com/rivo/uniseg#Step) or [`StepString`](https://pkg.go.dev/github.com/rivo/uniseg#StepString) Function
This avoids allocating a new `Graphemes` object but it requires the handling of states and boundaries:
```go
str := "🇩🇪🏳️‍🌈"
state := -1
var c string
for len(str) > 0 {
c, str, _, state = uniseg.StepString(str, state)
fmt.Printf("%x ", []rune(c))
}
// [1f1e9 1f1ea] [1f3f3 fe0f 200d 1f308]
```
### Advanced Examples
The [`Graphemes`](https://pkg.go.dev/github.com/rivo/uniseg#Graphemes) class offers the most convenient way to access all functionality of this package. But in some cases, it may be better to use the specialized functions directly. For example, if you're only interested in word segmentation, use [`FirstWord`](https://pkg.go.dev/github.com/rivo/uniseg#FirstWord) or [`FirstWordInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstWordInString):
```go
str := "Hello, world!"
state := -1
var c string
for len(str) > 0 {
c, str, state = uniseg.FirstWordInString(str, state)
fmt.Printf("(%s)\n", c)
}
// (Hello)
// (,)
// ( )
// (world)
// (!)
```
Similarly, use
- [`FirstGraphemeCluster`](https://pkg.go.dev/github.com/rivo/uniseg#FirstGraphemeCluster) or [`FirstGraphemeClusterInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstGraphemeClusterInString) for grapheme cluster determination only,
- [`FirstSentence`](https://pkg.go.dev/github.com/rivo/uniseg#FirstSentence) or [`FirstSentenceInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstSentenceInString) for sentence segmentation only, and
- [`FirstLineSegment`](https://pkg.go.dev/github.com/rivo/uniseg#FirstLineSegment) or [`FirstLineSegmentInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstLineSegmentInString) for line breaking / word wrapping (although using [`Step`](https://pkg.go.dev/github.com/rivo/uniseg#Step) or [`StepString`](https://pkg.go.dev/github.com/rivo/uniseg#StepString) is preferred as it will observe grapheme cluster boundaries).
If you're only interested in the width of characters, use [`FirstGraphemeCluster`](https://pkg.go.dev/github.com/rivo/uniseg#FirstGraphemeCluster) or [`FirstGraphemeClusterInString`](https://pkg.go.dev/github.com/rivo/uniseg#FirstGraphemeClusterInString). It is much faster than using [`Step`](https://pkg.go.dev/github.com/rivo/uniseg#Step), [`StepString`](https://pkg.go.dev/github.com/rivo/uniseg#StepString), or the [`Graphemes`](https://pkg.go.dev/github.com/rivo/uniseg#Graphemes) class because it does not include the logic for word / sentence / line boundaries.
Finally, if you need to reverse a string while preserving grapheme clusters, use [`ReverseString`](https://pkg.go.dev/github.com/rivo/uniseg#ReverseString):
```go
fmt.Println(uniseg.ReverseString("🇩🇪🏳️‍🌈"))
// 🏳️‍🌈🇩🇪
```
## Documentation
Refer to https://pkg.go.dev/github.com/rivo/uniseg for the package's documentation.
## Dependencies
This package does not depend on any packages outside the standard library.
## Sponsor this Project
[Become a Sponsor on GitHub](https://github.com/sponsors/rivo?metadata_source=uniseg_readme) to support this project!
## Your Feedback
Add your issue here on GitHub, preferably before submitting any PR's. Feel free to get in touch if you have any questions.

108
vendor/github.com/rivo/uniseg/doc.go generated vendored Normal file
View File

@ -0,0 +1,108 @@
/*
Package uniseg implements Unicode Text Segmentation, Unicode Line Breaking, and
string width calculation for monospace fonts. Unicode Text Segmentation conforms
to Unicode Standard Annex #29 (https://unicode.org/reports/tr29/) and Unicode
Line Breaking conforms to Unicode Standard Annex #14
(https://unicode.org/reports/tr14/).
In short, using this package, you can split a string into grapheme clusters
(what people would usually refer to as a "character"), into words, and into
sentences. Or, in its simplest case, this package allows you to count the number
of characters in a string, especially when it contains complex characters such
as emojis, combining characters, or characters from Asian, Arabic, Hebrew, or
other languages. Additionally, you can use it to implement line breaking (or
"word wrapping"), that is, to determine where text can be broken over to the
next line when the width of the line is not big enough to fit the entire text.
Finally, you can use it to calculate the display width of a string for monospace
fonts.
# Getting Started
If you just want to count the number of characters in a string, you can use
[GraphemeClusterCount]. If you want to determine the display width of a string,
you can use [StringWidth]. If you want to iterate over a string, you can use
[Step], [StepString], or the [Graphemes] class (more convenient but less
performant). This will provide you with all information: grapheme clusters,
word boundaries, sentence boundaries, line breaks, and monospace character
widths. The specialized functions [FirstGraphemeCluster],
[FirstGraphemeClusterInString], [FirstWord], [FirstWordInString],
[FirstSentence], and [FirstSentenceInString] can be used if only one type of
information is needed.
# Grapheme Clusters
Consider the rainbow flag emoji: 🏳🌈. On most modern systems, it appears as one
character. But its string representation actually has 14 bytes, so counting
bytes (or using len("🏳️‍🌈")) will not work as expected. Counting runes won't,
either: The flag has 4 Unicode code points, thus 4 runes. The stdlib function
utf8.RuneCountInString("🏳️‍🌈") and len([]rune("🏳️‍🌈")) will both return 4.
The [GraphemeClusterCount] function will return 1 for the rainbow flag emoji.
The Graphemes class and a variety of functions in this package will allow you to
split strings into its grapheme clusters.
# Word Boundaries
Word boundaries are used in a number of different contexts. The most familiar
ones are selection (double-click mouse selection), cursor movement ("move to
next word" control-arrow keys), and the dialog option "Whole Word Search" for
search and replace. This package provides methods for determining word
boundaries.
# Sentence Boundaries
Sentence boundaries are often used for triple-click or some other method of
selecting or iterating through blocks of text that are larger than single words.
They are also used to determine whether words occur within the same sentence in
database queries. This package provides methods for determining sentence
boundaries.
# Line Breaking
Line breaking, also known as word wrapping, is the process of breaking a section
of text into lines such that it will fit in the available width of a page,
window or other display area. This package provides methods to determine the
positions in a string where a line must be broken, may be broken, or must not be
broken.
# Monospace Width
Monospace width, as referred to in this package, is the width of a string in a
monospace font. This is commonly used in terminal user interfaces or text
displays or editors that don't support proportional fonts. A width of 1
corresponds to a single character cell. The C function [wcswidth()] and its
implementation in other programming languages is in widespread use for the same
purpose. However, there is no standard for the calculation of such widths, and
this package differs from wcswidth() in a number of ways, presumably to generate
more visually pleasing results.
To start, we assume that every code point has a width of 1, with the following
exceptions:
- Code points with grapheme cluster break properties Control, CR, LF, Extend,
and ZWJ have a width of 0.
- U+2E3A, Two-Em Dash, has a width of 3.
- U+2E3B, Three-Em Dash, has a width of 4.
- Characters with the East-Asian Width properties "Fullwidth" (F) and "Wide"
(W) have a width of 2. (Properties "Ambiguous" (A) and "Neutral" (N) both
have a width of 1.)
- Code points with grapheme cluster break property Regional Indicator have a
width of 2.
- Code points with grapheme cluster break property Extended Pictographic have
a width of 2, unless their Emoji Presentation flag is "No", in which case
the width is 1.
For Hangul grapheme clusters composed of conjoining Jamo and for Regional
Indicators (flags), all code points except the first one have a width of 0. For
grapheme clusters starting with an Extended Pictographic, any additional code
point will force a total width of 2, except if the Variation Selector-15
(U+FE0E) is included, in which case the total width is always 1. Grapheme
clusters ending with Variation Selector-16 (U+FE0F) have a width of 2.
Note that whether these widths appear correct depends on your application's
render engine, to which extent it conforms to the Unicode Standard, and its
choice of font.
[wcswidth()]: https://man7.org/linux/man-pages/man3/wcswidth.3.html
*/
package uniseg

2588
vendor/github.com/rivo/uniseg/eastasianwidth.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

295
vendor/github.com/rivo/uniseg/emojipresentation.go generated vendored Normal file
View File

@ -0,0 +1,295 @@
// Code generated via go generate from gen_properties.go. DO NOT EDIT.
package uniseg
// emojiPresentation are taken from
//
// and
// https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
// ("Extended_Pictographic" only)
// on September 5, 2023. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var emojiPresentation = [][3]int{
{0x231A, 0x231B, prEmojiPresentation}, // E0.6 [2] (⌚..⌛) watch..hourglass done
{0x23E9, 0x23EC, prEmojiPresentation}, // E0.6 [4] (⏩..⏬) fast-forward button..fast down button
{0x23F0, 0x23F0, prEmojiPresentation}, // E0.6 [1] (⏰) alarm clock
{0x23F3, 0x23F3, prEmojiPresentation}, // E0.6 [1] (⏳) hourglass not done
{0x25FD, 0x25FE, prEmojiPresentation}, // E0.6 [2] (◽..◾) white medium-small square..black medium-small square
{0x2614, 0x2615, prEmojiPresentation}, // E0.6 [2] (☔..☕) umbrella with rain drops..hot beverage
{0x2648, 0x2653, prEmojiPresentation}, // E0.6 [12] (♈..♓) Aries..Pisces
{0x267F, 0x267F, prEmojiPresentation}, // E0.6 [1] (♿) wheelchair symbol
{0x2693, 0x2693, prEmojiPresentation}, // E0.6 [1] (⚓) anchor
{0x26A1, 0x26A1, prEmojiPresentation}, // E0.6 [1] (⚡) high voltage
{0x26AA, 0x26AB, prEmojiPresentation}, // E0.6 [2] (⚪..⚫) white circle..black circle
{0x26BD, 0x26BE, prEmojiPresentation}, // E0.6 [2] (⚽..⚾) soccer ball..baseball
{0x26C4, 0x26C5, prEmojiPresentation}, // E0.6 [2] (⛄..⛅) snowman without snow..sun behind cloud
{0x26CE, 0x26CE, prEmojiPresentation}, // E0.6 [1] (⛎) Ophiuchus
{0x26D4, 0x26D4, prEmojiPresentation}, // E0.6 [1] (⛔) no entry
{0x26EA, 0x26EA, prEmojiPresentation}, // E0.6 [1] (⛪) church
{0x26F2, 0x26F3, prEmojiPresentation}, // E0.6 [2] (⛲..⛳) fountain..flag in hole
{0x26F5, 0x26F5, prEmojiPresentation}, // E0.6 [1] (⛵) sailboat
{0x26FA, 0x26FA, prEmojiPresentation}, // E0.6 [1] (⛺) tent
{0x26FD, 0x26FD, prEmojiPresentation}, // E0.6 [1] (⛽) fuel pump
{0x2705, 0x2705, prEmojiPresentation}, // E0.6 [1] (✅) check mark button
{0x270A, 0x270B, prEmojiPresentation}, // E0.6 [2] (✊..✋) raised fist..raised hand
{0x2728, 0x2728, prEmojiPresentation}, // E0.6 [1] (✨) sparkles
{0x274C, 0x274C, prEmojiPresentation}, // E0.6 [1] (❌) cross mark
{0x274E, 0x274E, prEmojiPresentation}, // E0.6 [1] (❎) cross mark button
{0x2753, 0x2755, prEmojiPresentation}, // E0.6 [3] (❓..❕) red question mark..white exclamation mark
{0x2757, 0x2757, prEmojiPresentation}, // E0.6 [1] (❗) red exclamation mark
{0x2795, 0x2797, prEmojiPresentation}, // E0.6 [3] (..➗) plus..divide
{0x27B0, 0x27B0, prEmojiPresentation}, // E0.6 [1] (➰) curly loop
{0x27BF, 0x27BF, prEmojiPresentation}, // E1.0 [1] (➿) double curly loop
{0x2B1B, 0x2B1C, prEmojiPresentation}, // E0.6 [2] (⬛..⬜) black large square..white large square
{0x2B50, 0x2B50, prEmojiPresentation}, // E0.6 [1] (⭐) star
{0x2B55, 0x2B55, prEmojiPresentation}, // E0.6 [1] (⭕) hollow red circle
{0x1F004, 0x1F004, prEmojiPresentation}, // E0.6 [1] (🀄) mahjong red dragon
{0x1F0CF, 0x1F0CF, prEmojiPresentation}, // E0.6 [1] (🃏) joker
{0x1F18E, 0x1F18E, prEmojiPresentation}, // E0.6 [1] (🆎) AB button (blood type)
{0x1F191, 0x1F19A, prEmojiPresentation}, // E0.6 [10] (🆑..🆚) CL button..VS button
{0x1F1E6, 0x1F1FF, prEmojiPresentation}, // E0.0 [26] (🇦..🇿) regional indicator symbol letter a..regional indicator symbol letter z
{0x1F201, 0x1F201, prEmojiPresentation}, // E0.6 [1] (🈁) Japanese “here” button
{0x1F21A, 0x1F21A, prEmojiPresentation}, // E0.6 [1] (🈚) Japanese “free of charge” button
{0x1F22F, 0x1F22F, prEmojiPresentation}, // E0.6 [1] (🈯) Japanese “reserved” button
{0x1F232, 0x1F236, prEmojiPresentation}, // E0.6 [5] (🈲..🈶) Japanese “prohibited” button..Japanese “not free of charge” button
{0x1F238, 0x1F23A, prEmojiPresentation}, // E0.6 [3] (🈸..🈺) Japanese “application” button..Japanese “open for business” button
{0x1F250, 0x1F251, prEmojiPresentation}, // E0.6 [2] (🉐..🉑) Japanese “bargain” button..Japanese “acceptable” button
{0x1F300, 0x1F30C, prEmojiPresentation}, // E0.6 [13] (🌀..🌌) cyclone..milky way
{0x1F30D, 0x1F30E, prEmojiPresentation}, // E0.7 [2] (🌍..🌎) globe showing Europe-Africa..globe showing Americas
{0x1F30F, 0x1F30F, prEmojiPresentation}, // E0.6 [1] (🌏) globe showing Asia-Australia
{0x1F310, 0x1F310, prEmojiPresentation}, // E1.0 [1] (🌐) globe with meridians
{0x1F311, 0x1F311, prEmojiPresentation}, // E0.6 [1] (🌑) new moon
{0x1F312, 0x1F312, prEmojiPresentation}, // E1.0 [1] (🌒) waxing crescent moon
{0x1F313, 0x1F315, prEmojiPresentation}, // E0.6 [3] (🌓..🌕) first quarter moon..full moon
{0x1F316, 0x1F318, prEmojiPresentation}, // E1.0 [3] (🌖..🌘) waning gibbous moon..waning crescent moon
{0x1F319, 0x1F319, prEmojiPresentation}, // E0.6 [1] (🌙) crescent moon
{0x1F31A, 0x1F31A, prEmojiPresentation}, // E1.0 [1] (🌚) new moon face
{0x1F31B, 0x1F31B, prEmojiPresentation}, // E0.6 [1] (🌛) first quarter moon face
{0x1F31C, 0x1F31C, prEmojiPresentation}, // E0.7 [1] (🌜) last quarter moon face
{0x1F31D, 0x1F31E, prEmojiPresentation}, // E1.0 [2] (🌝..🌞) full moon face..sun with face
{0x1F31F, 0x1F320, prEmojiPresentation}, // E0.6 [2] (🌟..🌠) glowing star..shooting star
{0x1F32D, 0x1F32F, prEmojiPresentation}, // E1.0 [3] (🌭..🌯) hot dog..burrito
{0x1F330, 0x1F331, prEmojiPresentation}, // E0.6 [2] (🌰..🌱) chestnut..seedling
{0x1F332, 0x1F333, prEmojiPresentation}, // E1.0 [2] (🌲..🌳) evergreen tree..deciduous tree
{0x1F334, 0x1F335, prEmojiPresentation}, // E0.6 [2] (🌴..🌵) palm tree..cactus
{0x1F337, 0x1F34A, prEmojiPresentation}, // E0.6 [20] (🌷..🍊) tulip..tangerine
{0x1F34B, 0x1F34B, prEmojiPresentation}, // E1.0 [1] (🍋) lemon
{0x1F34C, 0x1F34F, prEmojiPresentation}, // E0.6 [4] (🍌..🍏) banana..green apple
{0x1F350, 0x1F350, prEmojiPresentation}, // E1.0 [1] (🍐) pear
{0x1F351, 0x1F37B, prEmojiPresentation}, // E0.6 [43] (🍑..🍻) peach..clinking beer mugs
{0x1F37C, 0x1F37C, prEmojiPresentation}, // E1.0 [1] (🍼) baby bottle
{0x1F37E, 0x1F37F, prEmojiPresentation}, // E1.0 [2] (🍾..🍿) bottle with popping cork..popcorn
{0x1F380, 0x1F393, prEmojiPresentation}, // E0.6 [20] (🎀..🎓) ribbon..graduation cap
{0x1F3A0, 0x1F3C4, prEmojiPresentation}, // E0.6 [37] (🎠..🏄) carousel horse..person surfing
{0x1F3C5, 0x1F3C5, prEmojiPresentation}, // E1.0 [1] (🏅) sports medal
{0x1F3C6, 0x1F3C6, prEmojiPresentation}, // E0.6 [1] (🏆) trophy
{0x1F3C7, 0x1F3C7, prEmojiPresentation}, // E1.0 [1] (🏇) horse racing
{0x1F3C8, 0x1F3C8, prEmojiPresentation}, // E0.6 [1] (🏈) american football
{0x1F3C9, 0x1F3C9, prEmojiPresentation}, // E1.0 [1] (🏉) rugby football
{0x1F3CA, 0x1F3CA, prEmojiPresentation}, // E0.6 [1] (🏊) person swimming
{0x1F3CF, 0x1F3D3, prEmojiPresentation}, // E1.0 [5] (🏏..🏓) cricket game..ping pong
{0x1F3E0, 0x1F3E3, prEmojiPresentation}, // E0.6 [4] (🏠..🏣) house..Japanese post office
{0x1F3E4, 0x1F3E4, prEmojiPresentation}, // E1.0 [1] (🏤) post office
{0x1F3E5, 0x1F3F0, prEmojiPresentation}, // E0.6 [12] (🏥..🏰) hospital..castle
{0x1F3F4, 0x1F3F4, prEmojiPresentation}, // E1.0 [1] (🏴) black flag
{0x1F3F8, 0x1F407, prEmojiPresentation}, // E1.0 [16] (🏸..🐇) badminton..rabbit
{0x1F408, 0x1F408, prEmojiPresentation}, // E0.7 [1] (🐈) cat
{0x1F409, 0x1F40B, prEmojiPresentation}, // E1.0 [3] (🐉..🐋) dragon..whale
{0x1F40C, 0x1F40E, prEmojiPresentation}, // E0.6 [3] (🐌..🐎) snail..horse
{0x1F40F, 0x1F410, prEmojiPresentation}, // E1.0 [2] (🐏..🐐) ram..goat
{0x1F411, 0x1F412, prEmojiPresentation}, // E0.6 [2] (🐑..🐒) ewe..monkey
{0x1F413, 0x1F413, prEmojiPresentation}, // E1.0 [1] (🐓) rooster
{0x1F414, 0x1F414, prEmojiPresentation}, // E0.6 [1] (🐔) chicken
{0x1F415, 0x1F415, prEmojiPresentation}, // E0.7 [1] (🐕) dog
{0x1F416, 0x1F416, prEmojiPresentation}, // E1.0 [1] (🐖) pig
{0x1F417, 0x1F429, prEmojiPresentation}, // E0.6 [19] (🐗..🐩) boar..poodle
{0x1F42A, 0x1F42A, prEmojiPresentation}, // E1.0 [1] (🐪) camel
{0x1F42B, 0x1F43E, prEmojiPresentation}, // E0.6 [20] (🐫..🐾) two-hump camel..paw prints
{0x1F440, 0x1F440, prEmojiPresentation}, // E0.6 [1] (👀) eyes
{0x1F442, 0x1F464, prEmojiPresentation}, // E0.6 [35] (👂..👤) ear..bust in silhouette
{0x1F465, 0x1F465, prEmojiPresentation}, // E1.0 [1] (👥) busts in silhouette
{0x1F466, 0x1F46B, prEmojiPresentation}, // E0.6 [6] (👦..👫) boy..woman and man holding hands
{0x1F46C, 0x1F46D, prEmojiPresentation}, // E1.0 [2] (👬..👭) men holding hands..women holding hands
{0x1F46E, 0x1F4AC, prEmojiPresentation}, // E0.6 [63] (👮..💬) police officer..speech balloon
{0x1F4AD, 0x1F4AD, prEmojiPresentation}, // E1.0 [1] (💭) thought balloon
{0x1F4AE, 0x1F4B5, prEmojiPresentation}, // E0.6 [8] (💮..💵) white flower..dollar banknote
{0x1F4B6, 0x1F4B7, prEmojiPresentation}, // E1.0 [2] (💶..💷) euro banknote..pound banknote
{0x1F4B8, 0x1F4EB, prEmojiPresentation}, // E0.6 [52] (💸..📫) money with wings..closed mailbox with raised flag
{0x1F4EC, 0x1F4ED, prEmojiPresentation}, // E0.7 [2] (📬..📭) open mailbox with raised flag..open mailbox with lowered flag
{0x1F4EE, 0x1F4EE, prEmojiPresentation}, // E0.6 [1] (📮) postbox
{0x1F4EF, 0x1F4EF, prEmojiPresentation}, // E1.0 [1] (📯) postal horn
{0x1F4F0, 0x1F4F4, prEmojiPresentation}, // E0.6 [5] (📰..📴) newspaper..mobile phone off
{0x1F4F5, 0x1F4F5, prEmojiPresentation}, // E1.0 [1] (📵) no mobile phones
{0x1F4F6, 0x1F4F7, prEmojiPresentation}, // E0.6 [2] (📶..📷) antenna bars..camera
{0x1F4F8, 0x1F4F8, prEmojiPresentation}, // E1.0 [1] (📸) camera with flash
{0x1F4F9, 0x1F4FC, prEmojiPresentation}, // E0.6 [4] (📹..📼) video camera..videocassette
{0x1F4FF, 0x1F502, prEmojiPresentation}, // E1.0 [4] (📿..🔂) prayer beads..repeat single button
{0x1F503, 0x1F503, prEmojiPresentation}, // E0.6 [1] (🔃) clockwise vertical arrows
{0x1F504, 0x1F507, prEmojiPresentation}, // E1.0 [4] (🔄..🔇) counterclockwise arrows button..muted speaker
{0x1F508, 0x1F508, prEmojiPresentation}, // E0.7 [1] (🔈) speaker low volume
{0x1F509, 0x1F509, prEmojiPresentation}, // E1.0 [1] (🔉) speaker medium volume
{0x1F50A, 0x1F514, prEmojiPresentation}, // E0.6 [11] (🔊..🔔) speaker high volume..bell
{0x1F515, 0x1F515, prEmojiPresentation}, // E1.0 [1] (🔕) bell with slash
{0x1F516, 0x1F52B, prEmojiPresentation}, // E0.6 [22] (🔖..🔫) bookmark..water pistol
{0x1F52C, 0x1F52D, prEmojiPresentation}, // E1.0 [2] (🔬..🔭) microscope..telescope
{0x1F52E, 0x1F53D, prEmojiPresentation}, // E0.6 [16] (🔮..🔽) crystal ball..downwards button
{0x1F54B, 0x1F54E, prEmojiPresentation}, // E1.0 [4] (🕋..🕎) kaaba..menorah
{0x1F550, 0x1F55B, prEmojiPresentation}, // E0.6 [12] (🕐..🕛) one oclock..twelve oclock
{0x1F55C, 0x1F567, prEmojiPresentation}, // E0.7 [12] (🕜..🕧) one-thirty..twelve-thirty
{0x1F57A, 0x1F57A, prEmojiPresentation}, // E3.0 [1] (🕺) man dancing
{0x1F595, 0x1F596, prEmojiPresentation}, // E1.0 [2] (🖕..🖖) middle finger..vulcan salute
{0x1F5A4, 0x1F5A4, prEmojiPresentation}, // E3.0 [1] (🖤) black heart
{0x1F5FB, 0x1F5FF, prEmojiPresentation}, // E0.6 [5] (🗻..🗿) mount fuji..moai
{0x1F600, 0x1F600, prEmojiPresentation}, // E1.0 [1] (😀) grinning face
{0x1F601, 0x1F606, prEmojiPresentation}, // E0.6 [6] (😁..😆) beaming face with smiling eyes..grinning squinting face
{0x1F607, 0x1F608, prEmojiPresentation}, // E1.0 [2] (😇..😈) smiling face with halo..smiling face with horns
{0x1F609, 0x1F60D, prEmojiPresentation}, // E0.6 [5] (😉..😍) winking face..smiling face with heart-eyes
{0x1F60E, 0x1F60E, prEmojiPresentation}, // E1.0 [1] (😎) smiling face with sunglasses
{0x1F60F, 0x1F60F, prEmojiPresentation}, // E0.6 [1] (😏) smirking face
{0x1F610, 0x1F610, prEmojiPresentation}, // E0.7 [1] (😐) neutral face
{0x1F611, 0x1F611, prEmojiPresentation}, // E1.0 [1] (😑) expressionless face
{0x1F612, 0x1F614, prEmojiPresentation}, // E0.6 [3] (😒..😔) unamused face..pensive face
{0x1F615, 0x1F615, prEmojiPresentation}, // E1.0 [1] (😕) confused face
{0x1F616, 0x1F616, prEmojiPresentation}, // E0.6 [1] (😖) confounded face
{0x1F617, 0x1F617, prEmojiPresentation}, // E1.0 [1] (😗) kissing face
{0x1F618, 0x1F618, prEmojiPresentation}, // E0.6 [1] (😘) face blowing a kiss
{0x1F619, 0x1F619, prEmojiPresentation}, // E1.0 [1] (😙) kissing face with smiling eyes
{0x1F61A, 0x1F61A, prEmojiPresentation}, // E0.6 [1] (😚) kissing face with closed eyes
{0x1F61B, 0x1F61B, prEmojiPresentation}, // E1.0 [1] (😛) face with tongue
{0x1F61C, 0x1F61E, prEmojiPresentation}, // E0.6 [3] (😜..😞) winking face with tongue..disappointed face
{0x1F61F, 0x1F61F, prEmojiPresentation}, // E1.0 [1] (😟) worried face
{0x1F620, 0x1F625, prEmojiPresentation}, // E0.6 [6] (😠..😥) angry face..sad but relieved face
{0x1F626, 0x1F627, prEmojiPresentation}, // E1.0 [2] (😦..😧) frowning face with open mouth..anguished face
{0x1F628, 0x1F62B, prEmojiPresentation}, // E0.6 [4] (😨..😫) fearful face..tired face
{0x1F62C, 0x1F62C, prEmojiPresentation}, // E1.0 [1] (😬) grimacing face
{0x1F62D, 0x1F62D, prEmojiPresentation}, // E0.6 [1] (😭) loudly crying face
{0x1F62E, 0x1F62F, prEmojiPresentation}, // E1.0 [2] (😮..😯) face with open mouth..hushed face
{0x1F630, 0x1F633, prEmojiPresentation}, // E0.6 [4] (😰..😳) anxious face with sweat..flushed face
{0x1F634, 0x1F634, prEmojiPresentation}, // E1.0 [1] (😴) sleeping face
{0x1F635, 0x1F635, prEmojiPresentation}, // E0.6 [1] (😵) face with crossed-out eyes
{0x1F636, 0x1F636, prEmojiPresentation}, // E1.0 [1] (😶) face without mouth
{0x1F637, 0x1F640, prEmojiPresentation}, // E0.6 [10] (😷..🙀) face with medical mask..weary cat
{0x1F641, 0x1F644, prEmojiPresentation}, // E1.0 [4] (🙁..🙄) slightly frowning face..face with rolling eyes
{0x1F645, 0x1F64F, prEmojiPresentation}, // E0.6 [11] (🙅..🙏) person gesturing NO..folded hands
{0x1F680, 0x1F680, prEmojiPresentation}, // E0.6 [1] (🚀) rocket
{0x1F681, 0x1F682, prEmojiPresentation}, // E1.0 [2] (🚁..🚂) helicopter..locomotive
{0x1F683, 0x1F685, prEmojiPresentation}, // E0.6 [3] (🚃..🚅) railway car..bullet train
{0x1F686, 0x1F686, prEmojiPresentation}, // E1.0 [1] (🚆) train
{0x1F687, 0x1F687, prEmojiPresentation}, // E0.6 [1] (🚇) metro
{0x1F688, 0x1F688, prEmojiPresentation}, // E1.0 [1] (🚈) light rail
{0x1F689, 0x1F689, prEmojiPresentation}, // E0.6 [1] (🚉) station
{0x1F68A, 0x1F68B, prEmojiPresentation}, // E1.0 [2] (🚊..🚋) tram..tram car
{0x1F68C, 0x1F68C, prEmojiPresentation}, // E0.6 [1] (🚌) bus
{0x1F68D, 0x1F68D, prEmojiPresentation}, // E0.7 [1] (🚍) oncoming bus
{0x1F68E, 0x1F68E, prEmojiPresentation}, // E1.0 [1] (🚎) trolleybus
{0x1F68F, 0x1F68F, prEmojiPresentation}, // E0.6 [1] (🚏) bus stop
{0x1F690, 0x1F690, prEmojiPresentation}, // E1.0 [1] (🚐) minibus
{0x1F691, 0x1F693, prEmojiPresentation}, // E0.6 [3] (🚑..🚓) ambulance..police car
{0x1F694, 0x1F694, prEmojiPresentation}, // E0.7 [1] (🚔) oncoming police car
{0x1F695, 0x1F695, prEmojiPresentation}, // E0.6 [1] (🚕) taxi
{0x1F696, 0x1F696, prEmojiPresentation}, // E1.0 [1] (🚖) oncoming taxi
{0x1F697, 0x1F697, prEmojiPresentation}, // E0.6 [1] (🚗) automobile
{0x1F698, 0x1F698, prEmojiPresentation}, // E0.7 [1] (🚘) oncoming automobile
{0x1F699, 0x1F69A, prEmojiPresentation}, // E0.6 [2] (🚙..🚚) sport utility vehicle..delivery truck
{0x1F69B, 0x1F6A1, prEmojiPresentation}, // E1.0 [7] (🚛..🚡) articulated lorry..aerial tramway
{0x1F6A2, 0x1F6A2, prEmojiPresentation}, // E0.6 [1] (🚢) ship
{0x1F6A3, 0x1F6A3, prEmojiPresentation}, // E1.0 [1] (🚣) person rowing boat
{0x1F6A4, 0x1F6A5, prEmojiPresentation}, // E0.6 [2] (🚤..🚥) speedboat..horizontal traffic light
{0x1F6A6, 0x1F6A6, prEmojiPresentation}, // E1.0 [1] (🚦) vertical traffic light
{0x1F6A7, 0x1F6AD, prEmojiPresentation}, // E0.6 [7] (🚧..🚭) construction..no smoking
{0x1F6AE, 0x1F6B1, prEmojiPresentation}, // E1.0 [4] (🚮..🚱) litter in bin sign..non-potable water
{0x1F6B2, 0x1F6B2, prEmojiPresentation}, // E0.6 [1] (🚲) bicycle
{0x1F6B3, 0x1F6B5, prEmojiPresentation}, // E1.0 [3] (🚳..🚵) no bicycles..person mountain biking
{0x1F6B6, 0x1F6B6, prEmojiPresentation}, // E0.6 [1] (🚶) person walking
{0x1F6B7, 0x1F6B8, prEmojiPresentation}, // E1.0 [2] (🚷..🚸) no pedestrians..children crossing
{0x1F6B9, 0x1F6BE, prEmojiPresentation}, // E0.6 [6] (🚹..🚾) mens room..water closet
{0x1F6BF, 0x1F6BF, prEmojiPresentation}, // E1.0 [1] (🚿) shower
{0x1F6C0, 0x1F6C0, prEmojiPresentation}, // E0.6 [1] (🛀) person taking bath
{0x1F6C1, 0x1F6C5, prEmojiPresentation}, // E1.0 [5] (🛁..🛅) bathtub..left luggage
{0x1F6CC, 0x1F6CC, prEmojiPresentation}, // E1.0 [1] (🛌) person in bed
{0x1F6D0, 0x1F6D0, prEmojiPresentation}, // E1.0 [1] (🛐) place of worship
{0x1F6D1, 0x1F6D2, prEmojiPresentation}, // E3.0 [2] (🛑..🛒) stop sign..shopping cart
{0x1F6D5, 0x1F6D5, prEmojiPresentation}, // E12.0 [1] (🛕) hindu temple
{0x1F6D6, 0x1F6D7, prEmojiPresentation}, // E13.0 [2] (🛖..🛗) hut..elevator
{0x1F6DC, 0x1F6DC, prEmojiPresentation}, // E15.0 [1] (🛜) wireless
{0x1F6DD, 0x1F6DF, prEmojiPresentation}, // E14.0 [3] (🛝..🛟) playground slide..ring buoy
{0x1F6EB, 0x1F6EC, prEmojiPresentation}, // E1.0 [2] (🛫..🛬) airplane departure..airplane arrival
{0x1F6F4, 0x1F6F6, prEmojiPresentation}, // E3.0 [3] (🛴..🛶) kick scooter..canoe
{0x1F6F7, 0x1F6F8, prEmojiPresentation}, // E5.0 [2] (🛷..🛸) sled..flying saucer
{0x1F6F9, 0x1F6F9, prEmojiPresentation}, // E11.0 [1] (🛹) skateboard
{0x1F6FA, 0x1F6FA, prEmojiPresentation}, // E12.0 [1] (🛺) auto rickshaw
{0x1F6FB, 0x1F6FC, prEmojiPresentation}, // E13.0 [2] (🛻..🛼) pickup truck..roller skate
{0x1F7E0, 0x1F7EB, prEmojiPresentation}, // E12.0 [12] (🟠..🟫) orange circle..brown square
{0x1F7F0, 0x1F7F0, prEmojiPresentation}, // E14.0 [1] (🟰) heavy equals sign
{0x1F90C, 0x1F90C, prEmojiPresentation}, // E13.0 [1] (🤌) pinched fingers
{0x1F90D, 0x1F90F, prEmojiPresentation}, // E12.0 [3] (🤍..🤏) white heart..pinching hand
{0x1F910, 0x1F918, prEmojiPresentation}, // E1.0 [9] (🤐..🤘) zipper-mouth face..sign of the horns
{0x1F919, 0x1F91E, prEmojiPresentation}, // E3.0 [6] (🤙..🤞) call me hand..crossed fingers
{0x1F91F, 0x1F91F, prEmojiPresentation}, // E5.0 [1] (🤟) love-you gesture
{0x1F920, 0x1F927, prEmojiPresentation}, // E3.0 [8] (🤠..🤧) cowboy hat face..sneezing face
{0x1F928, 0x1F92F, prEmojiPresentation}, // E5.0 [8] (🤨..🤯) face with raised eyebrow..exploding head
{0x1F930, 0x1F930, prEmojiPresentation}, // E3.0 [1] (🤰) pregnant woman
{0x1F931, 0x1F932, prEmojiPresentation}, // E5.0 [2] (🤱..🤲) breast-feeding..palms up together
{0x1F933, 0x1F93A, prEmojiPresentation}, // E3.0 [8] (🤳..🤺) selfie..person fencing
{0x1F93C, 0x1F93E, prEmojiPresentation}, // E3.0 [3] (🤼..🤾) people wrestling..person playing handball
{0x1F93F, 0x1F93F, prEmojiPresentation}, // E12.0 [1] (🤿) diving mask
{0x1F940, 0x1F945, prEmojiPresentation}, // E3.0 [6] (🥀..🥅) wilted flower..goal net
{0x1F947, 0x1F94B, prEmojiPresentation}, // E3.0 [5] (🥇..🥋) 1st place medal..martial arts uniform
{0x1F94C, 0x1F94C, prEmojiPresentation}, // E5.0 [1] (🥌) curling stone
{0x1F94D, 0x1F94F, prEmojiPresentation}, // E11.0 [3] (🥍..🥏) lacrosse..flying disc
{0x1F950, 0x1F95E, prEmojiPresentation}, // E3.0 [15] (🥐..🥞) croissant..pancakes
{0x1F95F, 0x1F96B, prEmojiPresentation}, // E5.0 [13] (🥟..🥫) dumpling..canned food
{0x1F96C, 0x1F970, prEmojiPresentation}, // E11.0 [5] (🥬..🥰) leafy green..smiling face with hearts
{0x1F971, 0x1F971, prEmojiPresentation}, // E12.0 [1] (🥱) yawning face
{0x1F972, 0x1F972, prEmojiPresentation}, // E13.0 [1] (🥲) smiling face with tear
{0x1F973, 0x1F976, prEmojiPresentation}, // E11.0 [4] (🥳..🥶) partying face..cold face
{0x1F977, 0x1F978, prEmojiPresentation}, // E13.0 [2] (🥷..🥸) ninja..disguised face
{0x1F979, 0x1F979, prEmojiPresentation}, // E14.0 [1] (🥹) face holding back tears
{0x1F97A, 0x1F97A, prEmojiPresentation}, // E11.0 [1] (🥺) pleading face
{0x1F97B, 0x1F97B, prEmojiPresentation}, // E12.0 [1] (🥻) sari
{0x1F97C, 0x1F97F, prEmojiPresentation}, // E11.0 [4] (🥼..🥿) lab coat..flat shoe
{0x1F980, 0x1F984, prEmojiPresentation}, // E1.0 [5] (🦀..🦄) crab..unicorn
{0x1F985, 0x1F991, prEmojiPresentation}, // E3.0 [13] (🦅..🦑) eagle..squid
{0x1F992, 0x1F997, prEmojiPresentation}, // E5.0 [6] (🦒..🦗) giraffe..cricket
{0x1F998, 0x1F9A2, prEmojiPresentation}, // E11.0 [11] (🦘..🦢) kangaroo..swan
{0x1F9A3, 0x1F9A4, prEmojiPresentation}, // E13.0 [2] (🦣..🦤) mammoth..dodo
{0x1F9A5, 0x1F9AA, prEmojiPresentation}, // E12.0 [6] (🦥..🦪) sloth..oyster
{0x1F9AB, 0x1F9AD, prEmojiPresentation}, // E13.0 [3] (🦫..🦭) beaver..seal
{0x1F9AE, 0x1F9AF, prEmojiPresentation}, // E12.0 [2] (🦮..🦯) guide dog..white cane
{0x1F9B0, 0x1F9B9, prEmojiPresentation}, // E11.0 [10] (🦰..🦹) red hair..supervillain
{0x1F9BA, 0x1F9BF, prEmojiPresentation}, // E12.0 [6] (🦺..🦿) safety vest..mechanical leg
{0x1F9C0, 0x1F9C0, prEmojiPresentation}, // E1.0 [1] (🧀) cheese wedge
{0x1F9C1, 0x1F9C2, prEmojiPresentation}, // E11.0 [2] (🧁..🧂) cupcake..salt
{0x1F9C3, 0x1F9CA, prEmojiPresentation}, // E12.0 [8] (🧃..🧊) beverage box..ice
{0x1F9CB, 0x1F9CB, prEmojiPresentation}, // E13.0 [1] (🧋) bubble tea
{0x1F9CC, 0x1F9CC, prEmojiPresentation}, // E14.0 [1] (🧌) troll
{0x1F9CD, 0x1F9CF, prEmojiPresentation}, // E12.0 [3] (🧍..🧏) person standing..deaf person
{0x1F9D0, 0x1F9E6, prEmojiPresentation}, // E5.0 [23] (🧐..🧦) face with monocle..socks
{0x1F9E7, 0x1F9FF, prEmojiPresentation}, // E11.0 [25] (🧧..🧿) red envelope..nazar amulet
{0x1FA70, 0x1FA73, prEmojiPresentation}, // E12.0 [4] (🩰..🩳) ballet shoes..shorts
{0x1FA74, 0x1FA74, prEmojiPresentation}, // E13.0 [1] (🩴) thong sandal
{0x1FA75, 0x1FA77, prEmojiPresentation}, // E15.0 [3] (🩵..🩷) light blue heart..pink heart
{0x1FA78, 0x1FA7A, prEmojiPresentation}, // E12.0 [3] (🩸..🩺) drop of blood..stethoscope
{0x1FA7B, 0x1FA7C, prEmojiPresentation}, // E14.0 [2] (🩻..🩼) x-ray..crutch
{0x1FA80, 0x1FA82, prEmojiPresentation}, // E12.0 [3] (🪀..🪂) yo-yo..parachute
{0x1FA83, 0x1FA86, prEmojiPresentation}, // E13.0 [4] (🪃..🪆) boomerang..nesting dolls
{0x1FA87, 0x1FA88, prEmojiPresentation}, // E15.0 [2] (🪇..🪈) maracas..flute
{0x1FA90, 0x1FA95, prEmojiPresentation}, // E12.0 [6] (🪐..🪕) ringed planet..banjo
{0x1FA96, 0x1FAA8, prEmojiPresentation}, // E13.0 [19] (🪖..🪨) military helmet..rock
{0x1FAA9, 0x1FAAC, prEmojiPresentation}, // E14.0 [4] (🪩..🪬) mirror ball..hamsa
{0x1FAAD, 0x1FAAF, prEmojiPresentation}, // E15.0 [3] (🪭..🪯) folding hand fan..khanda
{0x1FAB0, 0x1FAB6, prEmojiPresentation}, // E13.0 [7] (🪰..🪶) fly..feather
{0x1FAB7, 0x1FABA, prEmojiPresentation}, // E14.0 [4] (🪷..🪺) lotus..nest with eggs
{0x1FABB, 0x1FABD, prEmojiPresentation}, // E15.0 [3] (🪻..🪽) hyacinth..wing
{0x1FABF, 0x1FABF, prEmojiPresentation}, // E15.0 [1] (🪿) goose
{0x1FAC0, 0x1FAC2, prEmojiPresentation}, // E13.0 [3] (🫀..🫂) anatomical heart..people hugging
{0x1FAC3, 0x1FAC5, prEmojiPresentation}, // E14.0 [3] (🫃..🫅) pregnant man..person with crown
{0x1FACE, 0x1FACF, prEmojiPresentation}, // E15.0 [2] (🫎..🫏) moose..donkey
{0x1FAD0, 0x1FAD6, prEmojiPresentation}, // E13.0 [7] (🫐..🫖) blueberries..teapot
{0x1FAD7, 0x1FAD9, prEmojiPresentation}, // E14.0 [3] (🫗..🫙) pouring liquid..jar
{0x1FADA, 0x1FADB, prEmojiPresentation}, // E15.0 [2] (🫚..🫛) ginger root..pea pod
{0x1FAE0, 0x1FAE7, prEmojiPresentation}, // E14.0 [8] (🫠..🫧) melting face..bubbles
{0x1FAE8, 0x1FAE8, prEmojiPresentation}, // E15.0 [1] (🫨) shaking face
{0x1FAF0, 0x1FAF6, prEmojiPresentation}, // E14.0 [7] (🫰..🫶) hand with index finger and thumb crossed..heart hands
{0x1FAF7, 0x1FAF8, prEmojiPresentation}, // E15.0 [2] (🫷..🫸) leftwards pushing hand..rightwards pushing hand
}

215
vendor/github.com/rivo/uniseg/gen_breaktest.go generated vendored Normal file
View File

@ -0,0 +1,215 @@
//go:build generate
// This program generates a Go containing a slice of test cases based on the
// Unicode Character Database auxiliary data files. The command line arguments
// are as follows:
//
// 1. The name of the Unicode data file (just the filename, without extension).
// 2. The name of the locally generated Go file.
// 3. The name of the slice containing the test cases.
// 4. The name of the generator, for logging purposes.
//
//go:generate go run gen_breaktest.go GraphemeBreakTest graphemebreak_test.go graphemeBreakTestCases graphemes
//go:generate go run gen_breaktest.go WordBreakTest wordbreak_test.go wordBreakTestCases words
//go:generate go run gen_breaktest.go SentenceBreakTest sentencebreak_test.go sentenceBreakTestCases sentences
//go:generate go run gen_breaktest.go LineBreakTest linebreak_test.go lineBreakTestCases lines
package main
import (
"bufio"
"bytes"
"errors"
"fmt"
"go/format"
"io/ioutil"
"log"
"net/http"
"os"
"time"
)
// We want to test against a specific version rather than the latest. When the
// package is upgraded to a new version, change these to generate new tests.
const (
testCaseURL = `https://www.unicode.org/Public/15.0.0/ucd/auxiliary/%s.txt`
)
func main() {
if len(os.Args) < 5 {
fmt.Println("Not enough arguments, see code for details")
os.Exit(1)
}
log.SetPrefix("gen_breaktest (" + os.Args[4] + "): ")
log.SetFlags(0)
// Read text of testcases and parse into Go source code.
src, err := parse(fmt.Sprintf(testCaseURL, os.Args[1]))
if err != nil {
log.Fatal(err)
}
// Format the Go code.
formatted, err := format.Source(src)
if err != nil {
log.Fatalln("gofmt:", err)
}
// Write it out.
log.Print("Writing to ", os.Args[2])
if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
log.Fatal(err)
}
}
// parse reads a break text file, either from a local file or from a URL. It
// parses the file data into Go source code representing the test cases.
func parse(url string) ([]byte, error) {
log.Printf("Parsing %s", url)
res, err := http.Get(url)
if err != nil {
return nil, err
}
body := res.Body
defer body.Close()
buf := new(bytes.Buffer)
buf.Grow(120 << 10)
buf.WriteString(`// Code generated via go generate from gen_breaktest.go. DO NOT EDIT.
package uniseg
// ` + os.Args[3] + ` are Grapheme testcases taken from
// ` + url + `
// on ` + time.Now().Format("January 2, 2006") + `. See
// https://www.unicode.org/license.html for the Unicode license agreement.
var ` + os.Args[3] + ` = []testCase {
`)
sc := bufio.NewScanner(body)
num := 1
var line []byte
original := make([]byte, 0, 64)
expected := make([]byte, 0, 64)
for sc.Scan() {
num++
line = sc.Bytes()
if len(line) == 0 || line[0] == '#' {
continue
}
var comment []byte
if i := bytes.IndexByte(line, '#'); i >= 0 {
comment = bytes.TrimSpace(line[i+1:])
line = bytes.TrimSpace(line[:i])
}
original, expected, err := parseRuneSequence(line, original[:0], expected[:0])
if err != nil {
return nil, fmt.Errorf(`line %d: %v: %q`, num, err, line)
}
fmt.Fprintf(buf, "\t{original: \"%s\", expected: %s}, // %s\n", original, expected, comment)
}
if err := sc.Err(); err != nil {
return nil, err
}
// Check for final "# EOF", useful check if we're streaming via HTTP
if !bytes.Equal(line, []byte("# EOF")) {
return nil, fmt.Errorf(`line %d: exected "# EOF" as final line, got %q`, num, line)
}
buf.WriteString("}\n")
return buf.Bytes(), nil
}
// Used by parseRuneSequence to match input via bytes.HasPrefix.
var (
prefixBreak = []byte("÷ ")
prefixDontBreak = []byte("× ")
breakOk = []byte("÷")
breakNo = []byte("×")
)
// parseRuneSequence parses a rune + breaking opportunity sequence from b
// and appends the Go code for testcase.original to orig
// and appends the Go code for testcase.expected to exp.
// It retuns the new orig and exp slices.
//
// E.g. for the input b="÷ 0020 × 0308 ÷ 1F1E6 ÷"
// it will append
//
// "\u0020\u0308\U0001F1E6"
//
// and "[][]rune{{0x0020,0x0308},{0x1F1E6},}"
// to orig and exp respectively.
//
// The formatting of exp is expected to be cleaned up by gofmt or format.Source.
// Note we explicitly require the sequence to start with ÷ and we implicitly
// require it to end with ÷.
func parseRuneSequence(b, orig, exp []byte) ([]byte, []byte, error) {
// Check for and remove first ÷ or ×.
if !bytes.HasPrefix(b, prefixBreak) && !bytes.HasPrefix(b, prefixDontBreak) {
return nil, nil, errors.New("expected ÷ or × as first character")
}
if bytes.HasPrefix(b, prefixBreak) {
b = b[len(prefixBreak):]
} else {
b = b[len(prefixDontBreak):]
}
boundary := true
exp = append(exp, "[][]rune{"...)
for len(b) > 0 {
if boundary {
exp = append(exp, '{')
}
exp = append(exp, "0x"...)
// Find end of hex digits.
var i int
for i = 0; i < len(b) && b[i] != ' '; i++ {
if d := b[i]; ('0' <= d || d <= '9') ||
('A' <= d || d <= 'F') ||
('a' <= d || d <= 'f') {
continue
}
return nil, nil, errors.New("bad hex digit")
}
switch i {
case 4:
orig = append(orig, "\\u"...)
case 5:
orig = append(orig, "\\U000"...)
default:
return nil, nil, errors.New("unsupport code point hex length")
}
orig = append(orig, b[:i]...)
exp = append(exp, b[:i]...)
b = b[i:]
// Check for space between hex and ÷ or ×.
if len(b) < 1 || b[0] != ' ' {
return nil, nil, errors.New("bad input")
}
b = b[1:]
// Check for next boundary.
switch {
case bytes.HasPrefix(b, breakOk):
boundary = true
b = b[len(breakOk):]
case bytes.HasPrefix(b, breakNo):
boundary = false
b = b[len(breakNo):]
default:
return nil, nil, errors.New("missing ÷ or ×")
}
if boundary {
exp = append(exp, '}')
}
exp = append(exp, ',')
if len(b) > 0 && b[0] == ' ' {
b = b[1:]
}
}
exp = append(exp, '}')
return orig, exp, nil
}

261
vendor/github.com/rivo/uniseg/gen_properties.go generated vendored Normal file
View File

@ -0,0 +1,261 @@
//go:build generate
// This program generates a property file in Go file from Unicode Character
// Database auxiliary data files. The command line arguments are as follows:
//
// 1. The name of the Unicode data file (just the filename, without extension).
// Can be "-" (to skip) if the emoji flag is included.
// 2. The name of the locally generated Go file.
// 3. The name of the slice mapping code points to properties.
// 4. The name of the generator, for logging purposes.
// 5. (Optional) Flags, comma-separated. The following flags are available:
// - "emojis=<property>": include the specified emoji properties (e.g.
// "Extended_Pictographic").
// - "gencat": include general category properties.
//
//go:generate go run gen_properties.go auxiliary/GraphemeBreakProperty graphemeproperties.go graphemeCodePoints graphemes emojis=Extended_Pictographic
//go:generate go run gen_properties.go auxiliary/WordBreakProperty wordproperties.go workBreakCodePoints words emojis=Extended_Pictographic
//go:generate go run gen_properties.go auxiliary/SentenceBreakProperty sentenceproperties.go sentenceBreakCodePoints sentences
//go:generate go run gen_properties.go LineBreak lineproperties.go lineBreakCodePoints lines gencat
//go:generate go run gen_properties.go EastAsianWidth eastasianwidth.go eastAsianWidth eastasianwidth
//go:generate go run gen_properties.go - emojipresentation.go emojiPresentation emojipresentation emojis=Emoji_Presentation
package main
import (
"bufio"
"bytes"
"errors"
"fmt"
"go/format"
"io/ioutil"
"log"
"net/http"
"os"
"regexp"
"sort"
"strconv"
"strings"
"time"
)
// We want to test against a specific version rather than the latest. When the
// package is upgraded to a new version, change these to generate new tests.
const (
propertyURL = `https://www.unicode.org/Public/15.0.0/ucd/%s.txt`
emojiURL = `https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt`
)
// The regular expression for a line containing a code point range property.
var propertyPattern = regexp.MustCompile(`^([0-9A-F]{4,6})(\.\.([0-9A-F]{4,6}))?\s*;\s*([A-Za-z0-9_]+)\s*#\s(.+)$`)
func main() {
if len(os.Args) < 5 {
fmt.Println("Not enough arguments, see code for details")
os.Exit(1)
}
log.SetPrefix("gen_properties (" + os.Args[4] + "): ")
log.SetFlags(0)
// Parse flags.
flags := make(map[string]string)
if len(os.Args) >= 6 {
for _, flag := range strings.Split(os.Args[5], ",") {
flagFields := strings.Split(flag, "=")
if len(flagFields) == 1 {
flags[flagFields[0]] = "yes"
} else {
flags[flagFields[0]] = flagFields[1]
}
}
}
// Parse the text file and generate Go source code from it.
_, includeGeneralCategory := flags["gencat"]
var mainURL string
if os.Args[1] != "-" {
mainURL = fmt.Sprintf(propertyURL, os.Args[1])
}
src, err := parse(mainURL, flags["emojis"], includeGeneralCategory)
if err != nil {
log.Fatal(err)
}
// Format the Go code.
formatted, err := format.Source([]byte(src))
if err != nil {
log.Fatal("gofmt:", err)
}
// Save it to the (local) target file.
log.Print("Writing to ", os.Args[2])
if err := ioutil.WriteFile(os.Args[2], formatted, 0644); err != nil {
log.Fatal(err)
}
}
// parse parses the Unicode Properties text files located at the given URLs and
// returns their equivalent Go source code to be used in the uniseg package. If
// "emojiProperty" is not an empty string, emoji code points for that emoji
// property (e.g. "Extended_Pictographic") will be included. In those cases, you
// may pass an empty "propertyURL" to skip parsing the main properties file. If
// "includeGeneralCategory" is true, the Unicode General Category property will
// be extracted from the comments and included in the output.
func parse(propertyURL, emojiProperty string, includeGeneralCategory bool) (string, error) {
if propertyURL == "" && emojiProperty == "" {
return "", errors.New("no properties to parse")
}
// Temporary buffer to hold properties.
var properties [][4]string
// Open the first URL.
if propertyURL != "" {
log.Printf("Parsing %s", propertyURL)
res, err := http.Get(propertyURL)
if err != nil {
return "", err
}
in1 := res.Body
defer in1.Close()
// Parse it.
scanner := bufio.NewScanner(in1)
num := 0
for scanner.Scan() {
num++
line := strings.TrimSpace(scanner.Text())
// Skip comments and empty lines.
if strings.HasPrefix(line, "#") || line == "" {
continue
}
// Everything else must be a code point range, a property and a comment.
from, to, property, comment, err := parseProperty(line)
if err != nil {
return "", fmt.Errorf("%s line %d: %v", os.Args[4], num, err)
}
properties = append(properties, [4]string{from, to, property, comment})
}
if err := scanner.Err(); err != nil {
return "", err
}
}
// Open the second URL.
if emojiProperty != "" {
log.Printf("Parsing %s", emojiURL)
res, err := http.Get(emojiURL)
if err != nil {
return "", err
}
in2 := res.Body
defer in2.Close()
// Parse it.
scanner := bufio.NewScanner(in2)
num := 0
for scanner.Scan() {
num++
line := scanner.Text()
// Skip comments, empty lines, and everything not containing
// "Extended_Pictographic".
if strings.HasPrefix(line, "#") || line == "" || !strings.Contains(line, emojiProperty) {
continue
}
// Everything else must be a code point range, a property and a comment.
from, to, property, comment, err := parseProperty(line)
if err != nil {
return "", fmt.Errorf("emojis line %d: %v", num, err)
}
properties = append(properties, [4]string{from, to, property, comment})
}
if err := scanner.Err(); err != nil {
return "", err
}
}
// Avoid overflow during binary search.
if len(properties) >= 1<<31 {
return "", errors.New("too many properties")
}
// Sort properties.
sort.Slice(properties, func(i, j int) bool {
left, _ := strconv.ParseUint(properties[i][0], 16, 64)
right, _ := strconv.ParseUint(properties[j][0], 16, 64)
return left < right
})
// Header.
var (
buf bytes.Buffer
emojiComment string
)
columns := 3
if includeGeneralCategory {
columns = 4
}
if emojiURL != "" {
emojiComment = `
// and
// ` + emojiURL + `
// ("Extended_Pictographic" only)`
}
buf.WriteString(`// Code generated via go generate from gen_properties.go. DO NOT EDIT.
package uniseg
// ` + os.Args[3] + ` are taken from
// ` + propertyURL + emojiComment + `
// on ` + time.Now().Format("January 2, 2006") + `. See https://www.unicode.org/license.html for the Unicode
// license agreement.
var ` + os.Args[3] + ` = [][` + strconv.Itoa(columns) + `]int{
`)
// Properties.
for _, prop := range properties {
if includeGeneralCategory {
generalCategory := "gc" + prop[3][:2]
if generalCategory == "gcL&" {
generalCategory = "gcLC"
}
prop[3] = prop[3][3:]
fmt.Fprintf(&buf, "{0x%s,0x%s,%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), generalCategory, prop[3])
} else {
fmt.Fprintf(&buf, "{0x%s,0x%s,%s}, // %s\n", prop[0], prop[1], translateProperty("pr", prop[2]), prop[3])
}
}
// Tail.
buf.WriteString("}")
return buf.String(), nil
}
// parseProperty parses a line of the Unicode properties text file containing a
// property for a code point range and returns it along with its comment.
func parseProperty(line string) (from, to, property, comment string, err error) {
fields := propertyPattern.FindStringSubmatch(line)
if fields == nil {
err = errors.New("no property found")
return
}
from = fields[1]
to = fields[3]
if to == "" {
to = from
}
property = fields[4]
comment = fields[5]
return
}
// translateProperty translates a property name as used in the Unicode data file
// to a variable used in the Go code.
func translateProperty(prefix, property string) string {
return prefix + strings.ReplaceAll(property, "_", "")
}

331
vendor/github.com/rivo/uniseg/grapheme.go generated vendored Normal file
View File

@ -0,0 +1,331 @@
package uniseg
import "unicode/utf8"
// Graphemes implements an iterator over Unicode grapheme clusters, or
// user-perceived characters. While iterating, it also provides information
// about word boundaries, sentence boundaries, line breaks, and monospace
// character widths.
//
// After constructing the class via [NewGraphemes] for a given string "str",
// [Graphemes.Next] is called for every grapheme cluster in a loop until it
// returns false. Inside the loop, information about the grapheme cluster as
// well as boundary information and character width is available via the various
// methods (see examples below).
//
// This class basically wraps the [StepString] parser and provides a convenient
// interface to it. If you are only interested in some parts of this package's
// functionality, using the specialized functions starting with "First" is
// almost always faster.
type Graphemes struct {
// The original string.
original string
// The remaining string to be parsed.
remaining string
// The current grapheme cluster.
cluster string
// The byte offset of the current grapheme cluster relative to the original
// string.
offset int
// The current boundary information of the [Step] parser.
boundaries int
// The current state of the [Step] parser.
state int
}
// NewGraphemes returns a new grapheme cluster iterator.
func NewGraphemes(str string) *Graphemes {
return &Graphemes{
original: str,
remaining: str,
state: -1,
}
}
// Next advances the iterator by one grapheme cluster and returns false if no
// clusters are left. This function must be called before the first cluster is
// accessed.
func (g *Graphemes) Next() bool {
if len(g.remaining) == 0 {
// We're already past the end.
g.state = -2
g.cluster = ""
return false
}
g.offset += len(g.cluster)
g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state)
return true
}
// Runes returns a slice of runes (code points) which corresponds to the current
// grapheme cluster. If the iterator is already past the end or [Graphemes.Next]
// has not yet been called, nil is returned.
func (g *Graphemes) Runes() []rune {
if g.state < 0 {
return nil
}
return []rune(g.cluster)
}
// Str returns a substring of the original string which corresponds to the
// current grapheme cluster. If the iterator is already past the end or
// [Graphemes.Next] has not yet been called, an empty string is returned.
func (g *Graphemes) Str() string {
return g.cluster
}
// Bytes returns a byte slice which corresponds to the current grapheme cluster.
// If the iterator is already past the end or [Graphemes.Next] has not yet been
// called, nil is returned.
func (g *Graphemes) Bytes() []byte {
if g.state < 0 {
return nil
}
return []byte(g.cluster)
}
// Positions returns the interval of the current grapheme cluster as byte
// positions into the original string. The first returned value "from" indexes
// the first byte and the second returned value "to" indexes the first byte that
// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
// the original string "str". If [Graphemes.Next] has not yet been called, both
// values are 0. If the iterator is already past the end, both values are 1.
func (g *Graphemes) Positions() (int, int) {
if g.state == -1 {
return 0, 0
} else if g.state == -2 {
return 1, 1
}
return g.offset, g.offset + len(g.cluster)
}
// IsWordBoundary returns true if a word ends after the current grapheme
// cluster.
func (g *Graphemes) IsWordBoundary() bool {
if g.state < 0 {
return true
}
return g.boundaries&MaskWord != 0
}
// IsSentenceBoundary returns true if a sentence ends after the current
// grapheme cluster.
func (g *Graphemes) IsSentenceBoundary() bool {
if g.state < 0 {
return true
}
return g.boundaries&MaskSentence != 0
}
// LineBreak returns whether the line can be broken after the current grapheme
// cluster. A value of [LineDontBreak] means the line may not be broken, a value
// of [LineMustBreak] means the line must be broken, and a value of
// [LineCanBreak] means the line may or may not be broken.
func (g *Graphemes) LineBreak() int {
if g.state == -1 {
return LineDontBreak
}
if g.state == -2 {
return LineMustBreak
}
return g.boundaries & MaskLine
}
// Width returns the monospace width of the current grapheme cluster.
func (g *Graphemes) Width() int {
if g.state < 0 {
return 0
}
return g.boundaries >> ShiftWidth
}
// Reset puts the iterator into its initial state such that the next call to
// [Graphemes.Next] sets it to the first grapheme cluster again.
func (g *Graphemes) Reset() {
g.state = -1
g.offset = 0
g.cluster = ""
g.remaining = g.original
}
// GraphemeClusterCount returns the number of user-perceived characters
// (grapheme clusters) for the given string.
func GraphemeClusterCount(s string) (n int) {
state := -1
for len(s) > 0 {
_, s, _, state = FirstGraphemeClusterInString(s, state)
n++
}
return
}
// ReverseString reverses the given string while observing grapheme cluster
// boundaries.
func ReverseString(s string) string {
str := []byte(s)
reversed := make([]byte, len(str))
state := -1
index := len(str)
for len(str) > 0 {
var cluster []byte
cluster, str, _, state = FirstGraphemeCluster(str, state)
index -= len(cluster)
copy(reversed[index:], cluster)
if index <= len(str)/2 {
break
}
}
return string(reversed)
}
// The number of bits the grapheme property must be shifted to make place for
// grapheme states.
const shiftGraphemePropState = 4
// FirstGraphemeCluster returns the first grapheme cluster found in the given
// byte slice according to the rules of [Unicode Standard Annex #29, Grapheme
// Cluster Boundaries]. This function can be called continuously to extract all
// grapheme clusters from a byte slice, as illustrated in the example below.
//
// If you don't know the current state, for example when calling the function
// for the first time, you must pass -1. For consecutive calls, pass the state
// and rest slice returned by the previous call.
//
// The "rest" slice is the sub-slice of the original byte slice "b" starting
// after the last byte of the identified grapheme cluster. If the length of the
// "rest" slice is 0, the entire byte slice "b" has been processed. The
// "cluster" byte slice is the sub-slice of the input slice containing the
// identified grapheme cluster.
//
// The returned width is the width of the grapheme cluster for most monospace
// fonts where a value of 1 represents one character cell.
//
// Given an empty byte slice "b", the function returns nil values.
//
// While slightly less convenient than using the Graphemes class, this function
// has much better performance and makes no allocations. It lends itself well to
// large byte slices.
//
// [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) {
// An empty byte slice returns nothing.
if len(b) == 0 {
return
}
// Extract the first rune.
r, length := utf8.DecodeRune(b)
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
var prop int
if state < 0 {
prop = propertyGraphemes(r)
} else {
prop = state >> shiftGraphemePropState
}
return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
}
// If we don't know the state, determine it now.
var firstProp int
if state < 0 {
state, firstProp, _ = transitionGraphemeState(state, r)
} else {
firstProp = state >> shiftGraphemePropState
}
width += runeWidth(r, firstProp)
// Transition until we find a boundary.
for {
var (
prop int
boundary bool
)
r, l := utf8.DecodeRune(b[length:])
state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
if boundary {
return b[:length], b[length:], width, state | (prop << shiftGraphemePropState)
}
if firstProp == prExtendedPictographic {
if r == vs15 {
width = 1
} else if r == vs16 {
width = 2
}
} else if firstProp != prRegionalIndicator && firstProp != prL {
width += runeWidth(r, prop)
}
length += l
if len(b) <= length {
return b, nil, width, grAny | (prop << shiftGraphemePropState)
}
}
}
// FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and
// outputs are strings.
func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) {
// An empty string returns nothing.
if len(str) == 0 {
return
}
// Extract the first rune.
r, length := utf8.DecodeRuneInString(str)
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
var prop int
if state < 0 {
prop = propertyGraphemes(r)
} else {
prop = state >> shiftGraphemePropState
}
return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState)
}
// If we don't know the state, determine it now.
var firstProp int
if state < 0 {
state, firstProp, _ = transitionGraphemeState(state, r)
} else {
firstProp = state >> shiftGraphemePropState
}
width += runeWidth(r, firstProp)
// Transition until we find a boundary.
for {
var (
prop int
boundary bool
)
r, l := utf8.DecodeRuneInString(str[length:])
state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r)
if boundary {
return str[:length], str[length:], width, state | (prop << shiftGraphemePropState)
}
if firstProp == prExtendedPictographic {
if r == vs15 {
width = 1
} else if r == vs16 {
width = 2
}
} else if firstProp != prRegionalIndicator && firstProp != prL {
width += runeWidth(r, prop)
}
length += l
if len(str) <= length {
return str, "", width, grAny | (prop << shiftGraphemePropState)
}
}
}

1915
vendor/github.com/rivo/uniseg/graphemeproperties.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

176
vendor/github.com/rivo/uniseg/graphemerules.go generated vendored Normal file
View File

@ -0,0 +1,176 @@
package uniseg
// The states of the grapheme cluster parser.
const (
grAny = iota
grCR
grControlLF
grL
grLVV
grLVTT
grPrepend
grExtendedPictographic
grExtendedPictographicZWJ
grRIOdd
grRIEven
)
// The grapheme cluster parser's breaking instructions.
const (
grNoBoundary = iota
grBoundary
)
// grTransitions implements the grapheme cluster parser's state transitions.
// Maps state and property to a new state, a breaking instruction, and rule
// number. The breaking instruction always refers to the boundary between the
// last and next code point. Returns negative values if no transition is found.
//
// This function is used as follows:
//
// 1. Find specific state + specific property. Stop if found.
// 2. Find specific state + any property.
// 3. Find any state + specific property.
// 4. If only (2) or (3) (but not both) was found, stop.
// 5. If both (2) and (3) were found, use state from (3) and breaking instruction
// from the transition with the lower rule number, prefer (3) if rule numbers
// are equal. Stop.
// 6. Assume grAny and grBoundary.
//
// Unicode version 15.0.0.
func grTransitions(state, prop int) (newState int, newProp int, boundary int) {
// It turns out that using a big switch statement is much faster than using
// a map.
switch uint64(state) | uint64(prop)<<32 {
// GB5
case grAny | prCR<<32:
return grCR, grBoundary, 50
case grAny | prLF<<32:
return grControlLF, grBoundary, 50
case grAny | prControl<<32:
return grControlLF, grBoundary, 50
// GB4
case grCR | prAny<<32:
return grAny, grBoundary, 40
case grControlLF | prAny<<32:
return grAny, grBoundary, 40
// GB3
case grCR | prLF<<32:
return grControlLF, grNoBoundary, 30
// GB6
case grAny | prL<<32:
return grL, grBoundary, 9990
case grL | prL<<32:
return grL, grNoBoundary, 60
case grL | prV<<32:
return grLVV, grNoBoundary, 60
case grL | prLV<<32:
return grLVV, grNoBoundary, 60
case grL | prLVT<<32:
return grLVTT, grNoBoundary, 60
// GB7
case grAny | prLV<<32:
return grLVV, grBoundary, 9990
case grAny | prV<<32:
return grLVV, grBoundary, 9990
case grLVV | prV<<32:
return grLVV, grNoBoundary, 70
case grLVV | prT<<32:
return grLVTT, grNoBoundary, 70
// GB8
case grAny | prLVT<<32:
return grLVTT, grBoundary, 9990
case grAny | prT<<32:
return grLVTT, grBoundary, 9990
case grLVTT | prT<<32:
return grLVTT, grNoBoundary, 80
// GB9
case grAny | prExtend<<32:
return grAny, grNoBoundary, 90
case grAny | prZWJ<<32:
return grAny, grNoBoundary, 90
// GB9a
case grAny | prSpacingMark<<32:
return grAny, grNoBoundary, 91
// GB9b
case grAny | prPrepend<<32:
return grPrepend, grBoundary, 9990
case grPrepend | prAny<<32:
return grAny, grNoBoundary, 92
// GB11
case grAny | prExtendedPictographic<<32:
return grExtendedPictographic, grBoundary, 9990
case grExtendedPictographic | prExtend<<32:
return grExtendedPictographic, grNoBoundary, 110
case grExtendedPictographic | prZWJ<<32:
return grExtendedPictographicZWJ, grNoBoundary, 110
case grExtendedPictographicZWJ | prExtendedPictographic<<32:
return grExtendedPictographic, grNoBoundary, 110
// GB12 / GB13
case grAny | prRegionalIndicator<<32:
return grRIOdd, grBoundary, 9990
case grRIOdd | prRegionalIndicator<<32:
return grRIEven, grNoBoundary, 120
case grRIEven | prRegionalIndicator<<32:
return grRIOdd, grBoundary, 120
default:
return -1, -1, -1
}
}
// transitionGraphemeState determines the new state of the grapheme cluster
// parser given the current state and the next code point. It also returns the
// code point's grapheme property (the value mapped by the [graphemeCodePoints]
// table) and whether a cluster boundary was detected.
func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
// Determine the property of the next character.
prop = propertyGraphemes(r)
// Find the applicable transition.
nextState, nextProp, _ := grTransitions(state, prop)
if nextState >= 0 {
// We have a specific transition. We'll use it.
return nextState, prop, nextProp == grBoundary
}
// No specific transition found. Try the less specific ones.
anyPropState, anyPropProp, anyPropRule := grTransitions(state, prAny)
anyStateState, anyStateProp, anyStateRule := grTransitions(grAny, prop)
if anyPropState >= 0 && anyStateState >= 0 {
// Both apply. We'll use a mix (see comments for grTransitions).
newState = anyStateState
boundary = anyStateProp == grBoundary
if anyPropRule < anyStateRule {
boundary = anyPropProp == grBoundary
}
return
}
if anyPropState >= 0 {
// We only have a specific state.
return anyPropState, prop, anyPropProp == grBoundary
// This branch will probably never be reached because okAnyState will
// always be true given the current transition map. But we keep it here
// for future modifications to the transition map where this may not be
// true anymore.
}
if anyStateState >= 0 {
// We only have a specific property.
return anyStateState, prop, anyStateProp == grBoundary
}
// No known transition. GB999: Any ÷ Any.
return grAny, prop, true
}

134
vendor/github.com/rivo/uniseg/line.go generated vendored Normal file
View File

@ -0,0 +1,134 @@
package uniseg
import "unicode/utf8"
// FirstLineSegment returns the prefix of the given byte slice after which a
// decision to break the string over to the next line can or must be made,
// according to the rules of [Unicode Standard Annex #14]. This is used to
// implement line breaking.
//
// Line breaking, also known as word wrapping, is the process of breaking a
// section of text into lines such that it will fit in the available width of a
// page, window or other display area.
//
// The returned "segment" may not be broken into smaller parts, unless no other
// breaking opportunities present themselves, in which case you may break by
// grapheme clusters (using the [FirstGraphemeCluster] function to determine the
// grapheme clusters).
//
// The "mustBreak" flag indicates whether you MUST break the line after the
// given segment (true), for example after newline characters, or you MAY break
// the line after the given segment (false).
//
// This function can be called continuously to extract all non-breaking sub-sets
// from a byte slice, as illustrated in the example below.
//
// If you don't know the current state, for example when calling the function
// for the first time, you must pass -1. For consecutive calls, pass the state
// and rest slice returned by the previous call.
//
// The "rest" slice is the sub-slice of the original byte slice "b" starting
// after the last byte of the identified line segment. If the length of the
// "rest" slice is 0, the entire byte slice "b" has been processed. The
// "segment" byte slice is the sub-slice of the input slice containing the
// identified line segment.
//
// Given an empty byte slice "b", the function returns nil values.
//
// Note that in accordance with [UAX #14 LB3], the final segment will end with
// "mustBreak" set to true. You can choose to ignore this by checking if the
// length of the "rest" slice is 0 and calling [HasTrailingLineBreak] or
// [HasTrailingLineBreakInString] on the last rune.
//
// Note also that this algorithm may break within grapheme clusters. This is
// addressed in Section 8.2 Example 6 of UAX #14. To avoid this, you can use
// the [Step] function instead.
//
// [Unicode Standard Annex #14]: https://www.unicode.org/reports/tr14/
// [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm
func FirstLineSegment(b []byte, state int) (segment, rest []byte, mustBreak bool, newState int) {
// An empty byte slice returns nothing.
if len(b) == 0 {
return
}
// Extract the first rune.
r, length := utf8.DecodeRune(b)
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
return b, nil, true, lbAny // LB3.
}
// If we don't know the state, determine it now.
if state < 0 {
state, _ = transitionLineBreakState(state, r, b[length:], "")
}
// Transition until we find a boundary.
var boundary int
for {
r, l := utf8.DecodeRune(b[length:])
state, boundary = transitionLineBreakState(state, r, b[length+l:], "")
if boundary != LineDontBreak {
return b[:length], b[length:], boundary == LineMustBreak, state
}
length += l
if len(b) <= length {
return b, nil, true, lbAny // LB3
}
}
}
// FirstLineSegmentInString is like [FirstLineSegment] but its input and outputs
// are strings.
func FirstLineSegmentInString(str string, state int) (segment, rest string, mustBreak bool, newState int) {
// An empty byte slice returns nothing.
if len(str) == 0 {
return
}
// Extract the first rune.
r, length := utf8.DecodeRuneInString(str)
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
return str, "", true, lbAny // LB3.
}
// If we don't know the state, determine it now.
if state < 0 {
state, _ = transitionLineBreakState(state, r, nil, str[length:])
}
// Transition until we find a boundary.
var boundary int
for {
r, l := utf8.DecodeRuneInString(str[length:])
state, boundary = transitionLineBreakState(state, r, nil, str[length+l:])
if boundary != LineDontBreak {
return str[:length], str[length:], boundary == LineMustBreak, state
}
length += l
if len(str) <= length {
return str, "", true, lbAny // LB3.
}
}
}
// HasTrailingLineBreak returns true if the last rune in the given byte slice is
// one of the hard line break code points defined in LB4 and LB5 of [UAX #14].
//
// [UAX #14]: https://www.unicode.org/reports/tr14/#Algorithm
func HasTrailingLineBreak(b []byte) bool {
r, _ := utf8.DecodeLastRune(b)
property, _ := propertyLineBreak(r)
return property == prBK || property == prCR || property == prLF || property == prNL
}
// HasTrailingLineBreakInString is like [HasTrailingLineBreak] but for a string.
func HasTrailingLineBreakInString(str string) bool {
r, _ := utf8.DecodeLastRuneInString(str)
property, _ := propertyLineBreak(r)
return property == prBK || property == prCR || property == prLF || property == prNL
}

3554
vendor/github.com/rivo/uniseg/lineproperties.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

626
vendor/github.com/rivo/uniseg/linerules.go generated vendored Normal file
View File

@ -0,0 +1,626 @@
package uniseg
import "unicode/utf8"
// The states of the line break parser.
const (
lbAny = iota
lbBK
lbCR
lbLF
lbNL
lbSP
lbZW
lbWJ
lbGL
lbBA
lbHY
lbCL
lbCP
lbEX
lbIS
lbSY
lbOP
lbQU
lbQUSP
lbNS
lbCLCPSP
lbB2
lbB2SP
lbCB
lbBB
lbLB21a
lbHL
lbAL
lbNU
lbPR
lbEB
lbIDEM
lbNUNU
lbNUSY
lbNUIS
lbNUCL
lbNUCP
lbPO
lbJL
lbJV
lbJT
lbH2
lbH3
lbOddRI
lbEvenRI
lbExtPicCn
lbZWJBit = 64
lbCPeaFWHBit = 128
)
// These constants define whether a given text may be broken into the next line.
// If the break is optional (LineCanBreak), you may choose to break or not based
// on your own criteria, for example, if the text has reached the available
// width.
const (
LineDontBreak = iota // You may not break the line here.
LineCanBreak // You may or may not break the line here.
LineMustBreak // You must break the line here.
)
// lbTransitions implements the line break parser's state transitions. It's
// anologous to [grTransitions], see comments there for details.
//
// Unicode version 15.0.0.
func lbTransitions(state, prop int) (newState, lineBreak, rule int) {
switch uint64(state) | uint64(prop)<<32 {
// LB4.
case lbBK | prAny<<32:
return lbAny, LineMustBreak, 40
// LB5.
case lbCR | prLF<<32:
return lbLF, LineDontBreak, 50
case lbCR | prAny<<32:
return lbAny, LineMustBreak, 50
case lbLF | prAny<<32:
return lbAny, LineMustBreak, 50
case lbNL | prAny<<32:
return lbAny, LineMustBreak, 50
// LB6.
case lbAny | prBK<<32:
return lbBK, LineDontBreak, 60
case lbAny | prCR<<32:
return lbCR, LineDontBreak, 60
case lbAny | prLF<<32:
return lbLF, LineDontBreak, 60
case lbAny | prNL<<32:
return lbNL, LineDontBreak, 60
// LB7.
case lbAny | prSP<<32:
return lbSP, LineDontBreak, 70
case lbAny | prZW<<32:
return lbZW, LineDontBreak, 70
// LB8.
case lbZW | prSP<<32:
return lbZW, LineDontBreak, 70
case lbZW | prAny<<32:
return lbAny, LineCanBreak, 80
// LB11.
case lbAny | prWJ<<32:
return lbWJ, LineDontBreak, 110
case lbWJ | prAny<<32:
return lbAny, LineDontBreak, 110
// LB12.
case lbAny | prGL<<32:
return lbGL, LineCanBreak, 310
case lbGL | prAny<<32:
return lbAny, LineDontBreak, 120
// LB13 (simple transitions).
case lbAny | prCL<<32:
return lbCL, LineCanBreak, 310
case lbAny | prCP<<32:
return lbCP, LineCanBreak, 310
case lbAny | prEX<<32:
return lbEX, LineDontBreak, 130
case lbAny | prIS<<32:
return lbIS, LineCanBreak, 310
case lbAny | prSY<<32:
return lbSY, LineCanBreak, 310
// LB14.
case lbAny | prOP<<32:
return lbOP, LineCanBreak, 310
case lbOP | prSP<<32:
return lbOP, LineDontBreak, 70
case lbOP | prAny<<32:
return lbAny, LineDontBreak, 140
// LB15.
case lbQU | prSP<<32:
return lbQUSP, LineDontBreak, 70
case lbQU | prOP<<32:
return lbOP, LineDontBreak, 150
case lbQUSP | prOP<<32:
return lbOP, LineDontBreak, 150
// LB16.
case lbCL | prSP<<32:
return lbCLCPSP, LineDontBreak, 70
case lbNUCL | prSP<<32:
return lbCLCPSP, LineDontBreak, 70
case lbCP | prSP<<32:
return lbCLCPSP, LineDontBreak, 70
case lbNUCP | prSP<<32:
return lbCLCPSP, LineDontBreak, 70
case lbCL | prNS<<32:
return lbNS, LineDontBreak, 160
case lbNUCL | prNS<<32:
return lbNS, LineDontBreak, 160
case lbCP | prNS<<32:
return lbNS, LineDontBreak, 160
case lbNUCP | prNS<<32:
return lbNS, LineDontBreak, 160
case lbCLCPSP | prNS<<32:
return lbNS, LineDontBreak, 160
// LB17.
case lbAny | prB2<<32:
return lbB2, LineCanBreak, 310
case lbB2 | prSP<<32:
return lbB2SP, LineDontBreak, 70
case lbB2 | prB2<<32:
return lbB2, LineDontBreak, 170
case lbB2SP | prB2<<32:
return lbB2, LineDontBreak, 170
// LB18.
case lbSP | prAny<<32:
return lbAny, LineCanBreak, 180
case lbQUSP | prAny<<32:
return lbAny, LineCanBreak, 180
case lbCLCPSP | prAny<<32:
return lbAny, LineCanBreak, 180
case lbB2SP | prAny<<32:
return lbAny, LineCanBreak, 180
// LB19.
case lbAny | prQU<<32:
return lbQU, LineDontBreak, 190
case lbQU | prAny<<32:
return lbAny, LineDontBreak, 190
// LB20.
case lbAny | prCB<<32:
return lbCB, LineCanBreak, 200
case lbCB | prAny<<32:
return lbAny, LineCanBreak, 200
// LB21.
case lbAny | prBA<<32:
return lbBA, LineDontBreak, 210
case lbAny | prHY<<32:
return lbHY, LineDontBreak, 210
case lbAny | prNS<<32:
return lbNS, LineDontBreak, 210
case lbAny | prBB<<32:
return lbBB, LineCanBreak, 310
case lbBB | prAny<<32:
return lbAny, LineDontBreak, 210
// LB21a.
case lbAny | prHL<<32:
return lbHL, LineCanBreak, 310
case lbHL | prHY<<32:
return lbLB21a, LineDontBreak, 210
case lbHL | prBA<<32:
return lbLB21a, LineDontBreak, 210
case lbLB21a | prAny<<32:
return lbAny, LineDontBreak, 211
// LB21b.
case lbSY | prHL<<32:
return lbHL, LineDontBreak, 212
case lbNUSY | prHL<<32:
return lbHL, LineDontBreak, 212
// LB22.
case lbAny | prIN<<32:
return lbAny, LineDontBreak, 220
// LB23.
case lbAny | prAL<<32:
return lbAL, LineCanBreak, 310
case lbAny | prNU<<32:
return lbNU, LineCanBreak, 310
case lbAL | prNU<<32:
return lbNU, LineDontBreak, 230
case lbHL | prNU<<32:
return lbNU, LineDontBreak, 230
case lbNU | prAL<<32:
return lbAL, LineDontBreak, 230
case lbNU | prHL<<32:
return lbHL, LineDontBreak, 230
case lbNUNU | prAL<<32:
return lbAL, LineDontBreak, 230
case lbNUNU | prHL<<32:
return lbHL, LineDontBreak, 230
// LB23a.
case lbAny | prPR<<32:
return lbPR, LineCanBreak, 310
case lbAny | prID<<32:
return lbIDEM, LineCanBreak, 310
case lbAny | prEB<<32:
return lbEB, LineCanBreak, 310
case lbAny | prEM<<32:
return lbIDEM, LineCanBreak, 310
case lbPR | prID<<32:
return lbIDEM, LineDontBreak, 231
case lbPR | prEB<<32:
return lbEB, LineDontBreak, 231
case lbPR | prEM<<32:
return lbIDEM, LineDontBreak, 231
case lbIDEM | prPO<<32:
return lbPO, LineDontBreak, 231
case lbEB | prPO<<32:
return lbPO, LineDontBreak, 231
// LB24.
case lbAny | prPO<<32:
return lbPO, LineCanBreak, 310
case lbPR | prAL<<32:
return lbAL, LineDontBreak, 240
case lbPR | prHL<<32:
return lbHL, LineDontBreak, 240
case lbPO | prAL<<32:
return lbAL, LineDontBreak, 240
case lbPO | prHL<<32:
return lbHL, LineDontBreak, 240
case lbAL | prPR<<32:
return lbPR, LineDontBreak, 240
case lbAL | prPO<<32:
return lbPO, LineDontBreak, 240
case lbHL | prPR<<32:
return lbPR, LineDontBreak, 240
case lbHL | prPO<<32:
return lbPO, LineDontBreak, 240
// LB25 (simple transitions).
case lbPR | prNU<<32:
return lbNU, LineDontBreak, 250
case lbPO | prNU<<32:
return lbNU, LineDontBreak, 250
case lbOP | prNU<<32:
return lbNU, LineDontBreak, 250
case lbHY | prNU<<32:
return lbNU, LineDontBreak, 250
case lbNU | prNU<<32:
return lbNUNU, LineDontBreak, 250
case lbNU | prSY<<32:
return lbNUSY, LineDontBreak, 250
case lbNU | prIS<<32:
return lbNUIS, LineDontBreak, 250
case lbNUNU | prNU<<32:
return lbNUNU, LineDontBreak, 250
case lbNUNU | prSY<<32:
return lbNUSY, LineDontBreak, 250
case lbNUNU | prIS<<32:
return lbNUIS, LineDontBreak, 250
case lbNUSY | prNU<<32:
return lbNUNU, LineDontBreak, 250
case lbNUSY | prSY<<32:
return lbNUSY, LineDontBreak, 250
case lbNUSY | prIS<<32:
return lbNUIS, LineDontBreak, 250
case lbNUIS | prNU<<32:
return lbNUNU, LineDontBreak, 250
case lbNUIS | prSY<<32:
return lbNUSY, LineDontBreak, 250
case lbNUIS | prIS<<32:
return lbNUIS, LineDontBreak, 250
case lbNU | prCL<<32:
return lbNUCL, LineDontBreak, 250
case lbNU | prCP<<32:
return lbNUCP, LineDontBreak, 250
case lbNUNU | prCL<<32:
return lbNUCL, LineDontBreak, 250
case lbNUNU | prCP<<32:
return lbNUCP, LineDontBreak, 250
case lbNUSY | prCL<<32:
return lbNUCL, LineDontBreak, 250
case lbNUSY | prCP<<32:
return lbNUCP, LineDontBreak, 250
case lbNUIS | prCL<<32:
return lbNUCL, LineDontBreak, 250
case lbNUIS | prCP<<32:
return lbNUCP, LineDontBreak, 250
case lbNU | prPO<<32:
return lbPO, LineDontBreak, 250
case lbNUNU | prPO<<32:
return lbPO, LineDontBreak, 250
case lbNUSY | prPO<<32:
return lbPO, LineDontBreak, 250
case lbNUIS | prPO<<32:
return lbPO, LineDontBreak, 250
case lbNUCL | prPO<<32:
return lbPO, LineDontBreak, 250
case lbNUCP | prPO<<32:
return lbPO, LineDontBreak, 250
case lbNU | prPR<<32:
return lbPR, LineDontBreak, 250
case lbNUNU | prPR<<32:
return lbPR, LineDontBreak, 250
case lbNUSY | prPR<<32:
return lbPR, LineDontBreak, 250
case lbNUIS | prPR<<32:
return lbPR, LineDontBreak, 250
case lbNUCL | prPR<<32:
return lbPR, LineDontBreak, 250
case lbNUCP | prPR<<32:
return lbPR, LineDontBreak, 250
// LB26.
case lbAny | prJL<<32:
return lbJL, LineCanBreak, 310
case lbAny | prJV<<32:
return lbJV, LineCanBreak, 310
case lbAny | prJT<<32:
return lbJT, LineCanBreak, 310
case lbAny | prH2<<32:
return lbH2, LineCanBreak, 310
case lbAny | prH3<<32:
return lbH3, LineCanBreak, 310
case lbJL | prJL<<32:
return lbJL, LineDontBreak, 260
case lbJL | prJV<<32:
return lbJV, LineDontBreak, 260
case lbJL | prH2<<32:
return lbH2, LineDontBreak, 260
case lbJL | prH3<<32:
return lbH3, LineDontBreak, 260
case lbJV | prJV<<32:
return lbJV, LineDontBreak, 260
case lbJV | prJT<<32:
return lbJT, LineDontBreak, 260
case lbH2 | prJV<<32:
return lbJV, LineDontBreak, 260
case lbH2 | prJT<<32:
return lbJT, LineDontBreak, 260
case lbJT | prJT<<32:
return lbJT, LineDontBreak, 260
case lbH3 | prJT<<32:
return lbJT, LineDontBreak, 260
// LB27.
case lbJL | prPO<<32:
return lbPO, LineDontBreak, 270
case lbJV | prPO<<32:
return lbPO, LineDontBreak, 270
case lbJT | prPO<<32:
return lbPO, LineDontBreak, 270
case lbH2 | prPO<<32:
return lbPO, LineDontBreak, 270
case lbH3 | prPO<<32:
return lbPO, LineDontBreak, 270
case lbPR | prJL<<32:
return lbJL, LineDontBreak, 270
case lbPR | prJV<<32:
return lbJV, LineDontBreak, 270
case lbPR | prJT<<32:
return lbJT, LineDontBreak, 270
case lbPR | prH2<<32:
return lbH2, LineDontBreak, 270
case lbPR | prH3<<32:
return lbH3, LineDontBreak, 270
// LB28.
case lbAL | prAL<<32:
return lbAL, LineDontBreak, 280
case lbAL | prHL<<32:
return lbHL, LineDontBreak, 280
case lbHL | prAL<<32:
return lbAL, LineDontBreak, 280
case lbHL | prHL<<32:
return lbHL, LineDontBreak, 280
// LB29.
case lbIS | prAL<<32:
return lbAL, LineDontBreak, 290
case lbIS | prHL<<32:
return lbHL, LineDontBreak, 290
case lbNUIS | prAL<<32:
return lbAL, LineDontBreak, 290
case lbNUIS | prHL<<32:
return lbHL, LineDontBreak, 290
default:
return -1, -1, -1
}
}
// transitionLineBreakState determines the new state of the line break parser
// given the current state and the next code point. It also returns the type of
// line break: LineDontBreak, LineCanBreak, or LineMustBreak. If more than one
// code point is needed to determine the new state, the byte slice or the string
// starting after rune "r" can be used (whichever is not nil or empty) for
// further lookups.
func transitionLineBreakState(state int, r rune, b []byte, str string) (newState int, lineBreak int) {
// Determine the property of the next character.
nextProperty, generalCategory := propertyLineBreak(r)
// Prepare.
var forceNoBreak, isCPeaFWH bool
if state >= 0 && state&lbCPeaFWHBit != 0 {
isCPeaFWH = true // LB30: CP but ea is not F, W, or H.
state = state &^ lbCPeaFWHBit
}
if state >= 0 && state&lbZWJBit != 0 {
state = state &^ lbZWJBit // Extract zero-width joiner bit.
forceNoBreak = true // LB8a.
}
defer func() {
// Transition into LB30.
if newState == lbCP || newState == lbNUCP {
ea := propertyEastAsianWidth(r)
if ea != prF && ea != prW && ea != prH {
newState |= lbCPeaFWHBit
}
}
// Override break.
if forceNoBreak {
lineBreak = LineDontBreak
}
}()
// LB1.
if nextProperty == prAI || nextProperty == prSG || nextProperty == prXX {
nextProperty = prAL
} else if nextProperty == prSA {
if generalCategory == gcMn || generalCategory == gcMc {
nextProperty = prCM
} else {
nextProperty = prAL
}
} else if nextProperty == prCJ {
nextProperty = prNS
}
// Combining marks.
if nextProperty == prZWJ || nextProperty == prCM {
var bit int
if nextProperty == prZWJ {
bit = lbZWJBit
}
mustBreakState := state < 0 || state == lbBK || state == lbCR || state == lbLF || state == lbNL
if !mustBreakState && state != lbSP && state != lbZW && state != lbQUSP && state != lbCLCPSP && state != lbB2SP {
// LB9.
return state | bit, LineDontBreak
} else {
// LB10.
if mustBreakState {
return lbAL | bit, LineMustBreak
}
return lbAL | bit, LineCanBreak
}
}
// Find the applicable transition in the table.
var rule int
newState, lineBreak, rule = lbTransitions(state, nextProperty)
if newState < 0 {
// No specific transition found. Try the less specific ones.
anyPropProp, anyPropLineBreak, anyPropRule := lbTransitions(state, prAny)
anyStateProp, anyStateLineBreak, anyStateRule := lbTransitions(lbAny, nextProperty)
if anyPropProp >= 0 && anyStateProp >= 0 {
// Both apply. We'll use a mix (see comments for grTransitions).
newState, lineBreak, rule = anyStateProp, anyStateLineBreak, anyStateRule
if anyPropRule < anyStateRule {
lineBreak, rule = anyPropLineBreak, anyPropRule
}
} else if anyPropProp >= 0 {
// We only have a specific state.
newState, lineBreak, rule = anyPropProp, anyPropLineBreak, anyPropRule
// This branch will probably never be reached because okAnyState will
// always be true given the current transition map. But we keep it here
// for future modifications to the transition map where this may not be
// true anymore.
} else if anyStateProp >= 0 {
// We only have a specific property.
newState, lineBreak, rule = anyStateProp, anyStateLineBreak, anyStateRule
} else {
// No known transition. LB31: ALL ÷ ALL.
newState, lineBreak, rule = lbAny, LineCanBreak, 310
}
}
// LB12a.
if rule > 121 &&
nextProperty == prGL &&
(state != lbSP && state != lbBA && state != lbHY && state != lbLB21a && state != lbQUSP && state != lbCLCPSP && state != lbB2SP) {
return lbGL, LineDontBreak
}
// LB13.
if rule > 130 && state != lbNU && state != lbNUNU {
switch nextProperty {
case prCL:
return lbCL, LineDontBreak
case prCP:
return lbCP, LineDontBreak
case prIS:
return lbIS, LineDontBreak
case prSY:
return lbSY, LineDontBreak
}
}
// LB25 (look ahead).
if rule > 250 &&
(state == lbPR || state == lbPO) &&
nextProperty == prOP || nextProperty == prHY {
var r rune
if b != nil { // Byte slice version.
r, _ = utf8.DecodeRune(b)
} else { // String version.
r, _ = utf8.DecodeRuneInString(str)
}
if r != utf8.RuneError {
pr, _ := propertyLineBreak(r)
if pr == prNU {
return lbNU, LineDontBreak
}
}
}
// LB30 (part one).
if rule > 300 {
if (state == lbAL || state == lbHL || state == lbNU || state == lbNUNU) && nextProperty == prOP {
ea := propertyEastAsianWidth(r)
if ea != prF && ea != prW && ea != prH {
return lbOP, LineDontBreak
}
} else if isCPeaFWH {
switch nextProperty {
case prAL:
return lbAL, LineDontBreak
case prHL:
return lbHL, LineDontBreak
case prNU:
return lbNU, LineDontBreak
}
}
}
// LB30a.
if newState == lbAny && nextProperty == prRI {
if state != lbOddRI && state != lbEvenRI { // Includes state == -1.
// Transition into the first RI.
return lbOddRI, lineBreak
}
if state == lbOddRI {
// Don't break pairs of Regional Indicators.
return lbEvenRI, LineDontBreak
}
return lbOddRI, lineBreak
}
// LB30b.
if rule > 302 {
if nextProperty == prEM {
if state == lbEB || state == lbExtPicCn {
return prAny, LineDontBreak
}
}
graphemeProperty := propertyGraphemes(r)
if graphemeProperty == prExtendedPictographic && generalCategory == gcCn {
return lbExtPicCn, LineCanBreak
}
}
return
}

208
vendor/github.com/rivo/uniseg/properties.go generated vendored Normal file
View File

@ -0,0 +1,208 @@
package uniseg
// The Unicode properties as used in the various parsers. Only the ones needed
// in the context of this package are included.
const (
prXX = 0 // Same as prAny.
prAny = iota // prAny must be 0.
prPrepend // Grapheme properties must come first, to reduce the number of bits stored in the state vector.
prCR
prLF
prControl
prExtend
prRegionalIndicator
prSpacingMark
prL
prV
prT
prLV
prLVT
prZWJ
prExtendedPictographic
prNewline
prWSegSpace
prDoubleQuote
prSingleQuote
prMidNumLet
prNumeric
prMidLetter
prMidNum
prExtendNumLet
prALetter
prFormat
prHebrewLetter
prKatakana
prSp
prSTerm
prClose
prSContinue
prATerm
prUpper
prLower
prSep
prOLetter
prCM
prBA
prBK
prSP
prEX
prQU
prAL
prPR
prPO
prOP
prCP
prIS
prHY
prSY
prNU
prCL
prNL
prGL
prAI
prBB
prHL
prSA
prJL
prJV
prJT
prNS
prZW
prB2
prIN
prWJ
prID
prEB
prCJ
prH2
prH3
prSG
prCB
prRI
prEM
prN
prNa
prA
prW
prH
prF
prEmojiPresentation
)
// Unicode General Categories. Only the ones needed in the context of this
// package are included.
const (
gcNone = iota // gcNone must be 0.
gcCc
gcZs
gcPo
gcSc
gcPs
gcPe
gcSm
gcPd
gcNd
gcLu
gcSk
gcPc
gcLl
gcSo
gcLo
gcPi
gcCf
gcNo
gcPf
gcLC
gcLm
gcMn
gcMe
gcMc
gcNl
gcZl
gcZp
gcCn
gcCs
gcCo
)
// Special code points.
const (
vs15 = 0xfe0e // Variation Selector-15 (text presentation)
vs16 = 0xfe0f // Variation Selector-16 (emoji presentation)
)
// propertySearch performs a binary search on a property slice and returns the
// entry whose range (start = first array element, end = second array element)
// includes r, or an array of 0's if no such entry was found.
func propertySearch[E interface{ [3]int | [4]int }](dictionary []E, r rune) (result E) {
// Run a binary search.
from := 0
to := len(dictionary)
for to > from {
middle := (from + to) / 2
cpRange := dictionary[middle]
if int(r) < cpRange[0] {
to = middle
continue
}
if int(r) > cpRange[1] {
from = middle + 1
continue
}
return cpRange
}
return
}
// property returns the Unicode property value (see constants above) of the
// given code point.
func property(dictionary [][3]int, r rune) int {
return propertySearch(dictionary, r)[2]
}
// propertyLineBreak returns the Unicode property value and General Category
// (see constants above) of the given code point, as listed in the line break
// code points table, while fast tracking ASCII digits and letters.
func propertyLineBreak(r rune) (property, generalCategory int) {
if r >= 'a' && r <= 'z' {
return prAL, gcLl
}
if r >= 'A' && r <= 'Z' {
return prAL, gcLu
}
if r >= '0' && r <= '9' {
return prNU, gcNd
}
entry := propertySearch(lineBreakCodePoints, r)
return entry[2], entry[3]
}
// propertyGraphemes returns the Unicode grapheme cluster property value of the
// given code point while fast tracking ASCII characters.
func propertyGraphemes(r rune) int {
if r >= 0x20 && r <= 0x7e {
return prAny
}
if r == 0x0a {
return prLF
}
if r == 0x0d {
return prCR
}
if r >= 0 && r <= 0x1f || r == 0x7f {
return prControl
}
return property(graphemeCodePoints, r)
}
// propertyEastAsianWidth returns the Unicode East Asian Width property value of
// the given code point while fast tracking ASCII characters.
func propertyEastAsianWidth(r rune) int {
if r >= 0x20 && r <= 0x7e {
return prNa
}
if r >= 0 && r <= 0x1f || r == 0x7f {
return prN
}
return property(eastAsianWidth, r)
}

90
vendor/github.com/rivo/uniseg/sentence.go generated vendored Normal file
View File

@ -0,0 +1,90 @@
package uniseg
import "unicode/utf8"
// FirstSentence returns the first sentence found in the given byte slice
// according to the rules of [Unicode Standard Annex #29, Sentence Boundaries].
// This function can be called continuously to extract all sentences from a byte
// slice, as illustrated in the example below.
//
// If you don't know the current state, for example when calling the function
// for the first time, you must pass -1. For consecutive calls, pass the state
// and rest slice returned by the previous call.
//
// The "rest" slice is the sub-slice of the original byte slice "b" starting
// after the last byte of the identified sentence. If the length of the "rest"
// slice is 0, the entire byte slice "b" has been processed. The "sentence" byte
// slice is the sub-slice of the input slice containing the identified sentence.
//
// Given an empty byte slice "b", the function returns nil values.
//
// [Unicode Standard Annex #29, Sentence Boundaries]: http://unicode.org/reports/tr29/#Sentence_Boundaries
func FirstSentence(b []byte, state int) (sentence, rest []byte, newState int) {
// An empty byte slice returns nothing.
if len(b) == 0 {
return
}
// Extract the first rune.
r, length := utf8.DecodeRune(b)
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
return b, nil, sbAny
}
// If we don't know the state, determine it now.
if state < 0 {
state, _ = transitionSentenceBreakState(state, r, b[length:], "")
}
// Transition until we find a boundary.
var boundary bool
for {
r, l := utf8.DecodeRune(b[length:])
state, boundary = transitionSentenceBreakState(state, r, b[length+l:], "")
if boundary {
return b[:length], b[length:], state
}
length += l
if len(b) <= length {
return b, nil, sbAny
}
}
}
// FirstSentenceInString is like [FirstSentence] but its input and outputs are
// strings.
func FirstSentenceInString(str string, state int) (sentence, rest string, newState int) {
// An empty byte slice returns nothing.
if len(str) == 0 {
return
}
// Extract the first rune.
r, length := utf8.DecodeRuneInString(str)
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
return str, "", sbAny
}
// If we don't know the state, determine it now.
if state < 0 {
state, _ = transitionSentenceBreakState(state, r, nil, str[length:])
}
// Transition until we find a boundary.
var boundary bool
for {
r, l := utf8.DecodeRuneInString(str[length:])
state, boundary = transitionSentenceBreakState(state, r, nil, str[length+l:])
if boundary {
return str[:length], str[length:], state
}
length += l
if len(str) <= length {
return str, "", sbAny
}
}
}

2845
vendor/github.com/rivo/uniseg/sentenceproperties.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

276
vendor/github.com/rivo/uniseg/sentencerules.go generated vendored Normal file
View File

@ -0,0 +1,276 @@
package uniseg
import "unicode/utf8"
// The states of the sentence break parser.
const (
sbAny = iota
sbCR
sbParaSep
sbATerm
sbUpper
sbLower
sbSB7
sbSB8Close
sbSB8Sp
sbSTerm
sbSB8aClose
sbSB8aSp
)
// sbTransitions implements the sentence break parser's state transitions. It's
// anologous to [grTransitions], see comments there for details.
//
// Unicode version 15.0.0.
func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) {
switch uint64(state) | uint64(prop)<<32 {
// SB3.
case sbAny | prCR<<32:
return sbCR, false, 9990
case sbCR | prLF<<32:
return sbParaSep, false, 30
// SB4.
case sbAny | prSep<<32:
return sbParaSep, false, 9990
case sbAny | prLF<<32:
return sbParaSep, false, 9990
case sbParaSep | prAny<<32:
return sbAny, true, 40
case sbCR | prAny<<32:
return sbAny, true, 40
// SB6.
case sbAny | prATerm<<32:
return sbATerm, false, 9990
case sbATerm | prNumeric<<32:
return sbAny, false, 60
case sbSB7 | prNumeric<<32:
return sbAny, false, 60 // Because ATerm also appears in SB7.
// SB7.
case sbAny | prUpper<<32:
return sbUpper, false, 9990
case sbAny | prLower<<32:
return sbLower, false, 9990
case sbUpper | prATerm<<32:
return sbSB7, false, 70
case sbLower | prATerm<<32:
return sbSB7, false, 70
case sbSB7 | prUpper<<32:
return sbUpper, false, 70
// SB8a.
case sbAny | prSTerm<<32:
return sbSTerm, false, 9990
case sbATerm | prSContinue<<32:
return sbAny, false, 81
case sbATerm | prATerm<<32:
return sbATerm, false, 81
case sbATerm | prSTerm<<32:
return sbSTerm, false, 81
case sbSB7 | prSContinue<<32:
return sbAny, false, 81
case sbSB7 | prATerm<<32:
return sbATerm, false, 81
case sbSB7 | prSTerm<<32:
return sbSTerm, false, 81
case sbSB8Close | prSContinue<<32:
return sbAny, false, 81
case sbSB8Close | prATerm<<32:
return sbATerm, false, 81
case sbSB8Close | prSTerm<<32:
return sbSTerm, false, 81
case sbSB8Sp | prSContinue<<32:
return sbAny, false, 81
case sbSB8Sp | prATerm<<32:
return sbATerm, false, 81
case sbSB8Sp | prSTerm<<32:
return sbSTerm, false, 81
case sbSTerm | prSContinue<<32:
return sbAny, false, 81
case sbSTerm | prATerm<<32:
return sbATerm, false, 81
case sbSTerm | prSTerm<<32:
return sbSTerm, false, 81
case sbSB8aClose | prSContinue<<32:
return sbAny, false, 81
case sbSB8aClose | prATerm<<32:
return sbATerm, false, 81
case sbSB8aClose | prSTerm<<32:
return sbSTerm, false, 81
case sbSB8aSp | prSContinue<<32:
return sbAny, false, 81
case sbSB8aSp | prATerm<<32:
return sbATerm, false, 81
case sbSB8aSp | prSTerm<<32:
return sbSTerm, false, 81
// SB9.
case sbATerm | prClose<<32:
return sbSB8Close, false, 90
case sbSB7 | prClose<<32:
return sbSB8Close, false, 90
case sbSB8Close | prClose<<32:
return sbSB8Close, false, 90
case sbATerm | prSp<<32:
return sbSB8Sp, false, 90
case sbSB7 | prSp<<32:
return sbSB8Sp, false, 90
case sbSB8Close | prSp<<32:
return sbSB8Sp, false, 90
case sbSTerm | prClose<<32:
return sbSB8aClose, false, 90
case sbSB8aClose | prClose<<32:
return sbSB8aClose, false, 90
case sbSTerm | prSp<<32:
return sbSB8aSp, false, 90
case sbSB8aClose | prSp<<32:
return sbSB8aSp, false, 90
case sbATerm | prSep<<32:
return sbParaSep, false, 90
case sbATerm | prCR<<32:
return sbParaSep, false, 90
case sbATerm | prLF<<32:
return sbParaSep, false, 90
case sbSB7 | prSep<<32:
return sbParaSep, false, 90
case sbSB7 | prCR<<32:
return sbParaSep, false, 90
case sbSB7 | prLF<<32:
return sbParaSep, false, 90
case sbSB8Close | prSep<<32:
return sbParaSep, false, 90
case sbSB8Close | prCR<<32:
return sbParaSep, false, 90
case sbSB8Close | prLF<<32:
return sbParaSep, false, 90
case sbSTerm | prSep<<32:
return sbParaSep, false, 90
case sbSTerm | prCR<<32:
return sbParaSep, false, 90
case sbSTerm | prLF<<32:
return sbParaSep, false, 90
case sbSB8aClose | prSep<<32:
return sbParaSep, false, 90
case sbSB8aClose | prCR<<32:
return sbParaSep, false, 90
case sbSB8aClose | prLF<<32:
return sbParaSep, false, 90
// SB10.
case sbSB8Sp | prSp<<32:
return sbSB8Sp, false, 100
case sbSB8aSp | prSp<<32:
return sbSB8aSp, false, 100
case sbSB8Sp | prSep<<32:
return sbParaSep, false, 100
case sbSB8Sp | prCR<<32:
return sbParaSep, false, 100
case sbSB8Sp | prLF<<32:
return sbParaSep, false, 100
// SB11.
case sbATerm | prAny<<32:
return sbAny, true, 110
case sbSB7 | prAny<<32:
return sbAny, true, 110
case sbSB8Close | prAny<<32:
return sbAny, true, 110
case sbSB8Sp | prAny<<32:
return sbAny, true, 110
case sbSTerm | prAny<<32:
return sbAny, true, 110
case sbSB8aClose | prAny<<32:
return sbAny, true, 110
case sbSB8aSp | prAny<<32:
return sbAny, true, 110
// We'll always break after ParaSep due to SB4.
default:
return -1, false, -1
}
}
// transitionSentenceBreakState determines the new state of the sentence break
// parser given the current state and the next code point. It also returns
// whether a sentence boundary was detected. If more than one code point is
// needed to determine the new state, the byte slice or the string starting
// after rune "r" can be used (whichever is not nil or empty) for further
// lookups.
func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
// Determine the property of the next character.
nextProperty := property(sentenceBreakCodePoints, r)
// SB5 (Replacing Ignore Rules).
if nextProperty == prExtend || nextProperty == prFormat {
if state == sbParaSep || state == sbCR {
return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
}
if state < 0 {
return sbAny, true // SB1.
}
return state, false
}
// Find the applicable transition in the table.
var rule int
newState, sentenceBreak, rule = sbTransitions(state, nextProperty)
if newState < 0 {
// No specific transition found. Try the less specific ones.
anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny)
anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty)
if anyPropState >= 0 && anyStateState >= 0 {
// Both apply. We'll use a mix (see comments for grTransitions).
newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
if anyPropRule < anyStateRule {
sentenceBreak, rule = anyPropProp, anyPropRule
}
} else if anyPropState >= 0 {
// We only have a specific state.
newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule
// This branch will probably never be reached because okAnyState will
// always be true given the current transition map. But we keep it here
// for future modifications to the transition map where this may not be
// true anymore.
} else if anyStateState >= 0 {
// We only have a specific property.
newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
} else {
// No known transition. SB999: Any × Any.
newState, sentenceBreak, rule = sbAny, false, 9990
}
}
// SB8.
if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
// Check the right side of the rule.
var length int
for nextProperty != prOLetter &&
nextProperty != prUpper &&
nextProperty != prLower &&
nextProperty != prSep &&
nextProperty != prCR &&
nextProperty != prLF &&
nextProperty != prATerm &&
nextProperty != prSTerm {
// Move on to the next rune.
if b != nil { // Byte slice version.
r, length = utf8.DecodeRune(b)
b = b[length:]
} else { // String version.
r, length = utf8.DecodeRuneInString(str)
str = str[length:]
}
if r == utf8.RuneError {
break
}
nextProperty = property(sentenceBreakCodePoints, r)
}
if nextProperty == prLower {
return sbLower, false
}
}
return
}

242
vendor/github.com/rivo/uniseg/step.go generated vendored Normal file
View File

@ -0,0 +1,242 @@
package uniseg
import "unicode/utf8"
// The bit masks used to extract boundary information returned by [Step].
const (
MaskLine = 3
MaskWord = 4
MaskSentence = 8
)
// The number of bits to shift the boundary information returned by [Step] to
// obtain the monospace width of the grapheme cluster.
const ShiftWidth = 4
// The bit positions by which boundary flags are shifted by the [Step] function.
// These must correspond to the Mask constants.
const (
shiftWord = 2
shiftSentence = 3
// shiftwWidth is ShiftWidth above. No mask as these are always the remaining bits.
)
// The bit positions by which states are shifted by the [Step] function. These
// values must ensure state values defined for each of the boundary algorithms
// don't overlap (and that they all still fit in a single int). These must
// correspond to the Mask constants.
const (
shiftWordState = 4
shiftSentenceState = 9
shiftLineState = 13
shiftPropState = 21 // No mask as these are always the remaining bits.
)
// The bit mask used to extract the state returned by the [Step] function, after
// shifting. These values must correspond to the shift constants.
const (
maskGraphemeState = 0xf
maskWordState = 0x1f
maskSentenceState = 0xf
maskLineState = 0xff
)
// Step returns the first grapheme cluster (user-perceived character) found in
// the given byte slice. It also returns information about the boundary between
// that grapheme cluster and the one following it as well as the monospace width
// of the grapheme cluster. There are three types of boundary information: word
// boundaries, sentence boundaries, and line breaks. This function is therefore
// a combination of [FirstGraphemeCluster], [FirstWord], [FirstSentence], and
// [FirstLineSegment].
//
// The "boundaries" return value can be evaluated as follows:
//
// - boundaries&MaskWord != 0: The boundary is a word boundary.
// - boundaries&MaskWord == 0: The boundary is not a word boundary.
// - boundaries&MaskSentence != 0: The boundary is a sentence boundary.
// - boundaries&MaskSentence == 0: The boundary is not a sentence boundary.
// - boundaries&MaskLine == LineDontBreak: You must not break the line at the
// boundary.
// - boundaries&MaskLine == LineMustBreak: You must break the line at the
// boundary.
// - boundaries&MaskLine == LineCanBreak: You may or may not break the line at
// the boundary.
// - boundaries >> ShiftWidth: The width of the grapheme cluster for most
// monospace fonts where a value of 1 represents one character cell.
//
// This function can be called continuously to extract all grapheme clusters
// from a byte slice, as illustrated in the examples below.
//
// If you don't know which state to pass, for example when calling the function
// for the first time, you must pass -1. For consecutive calls, pass the state
// and rest slice returned by the previous call.
//
// The "rest" slice is the sub-slice of the original byte slice "b" starting
// after the last byte of the identified grapheme cluster. If the length of the
// "rest" slice is 0, the entire byte slice "b" has been processed. The
// "cluster" byte slice is the sub-slice of the input slice containing the
// first identified grapheme cluster.
//
// Given an empty byte slice "b", the function returns nil values.
//
// While slightly less convenient than using the Graphemes class, this function
// has much better performance and makes no allocations. It lends itself well to
// large byte slices.
//
// Note that in accordance with [UAX #14 LB3], the final segment will end with
// a mandatory line break (boundaries&MaskLine == LineMustBreak). You can choose
// to ignore this by checking if the length of the "rest" slice is 0 and calling
// [HasTrailingLineBreak] or [HasTrailingLineBreakInString] on the last rune.
//
// [UAX #14 LB3]: https://www.unicode.org/reports/tr14/#Algorithm
func Step(b []byte, state int) (cluster, rest []byte, boundaries int, newState int) {
// An empty byte slice returns nothing.
if len(b) == 0 {
return
}
// Extract the first rune.
r, length := utf8.DecodeRune(b)
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
var prop int
if state < 0 {
prop = propertyGraphemes(r)
} else {
prop = state >> shiftPropState
}
return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
}
// If we don't know the state, determine it now.
var graphemeState, wordState, sentenceState, lineState, firstProp int
remainder := b[length:]
if state < 0 {
graphemeState, firstProp, _ = transitionGraphemeState(state, r)
wordState, _ = transitionWordBreakState(state, r, remainder, "")
sentenceState, _ = transitionSentenceBreakState(state, r, remainder, "")
lineState, _ = transitionLineBreakState(state, r, remainder, "")
} else {
graphemeState = state & maskGraphemeState
wordState = (state >> shiftWordState) & maskWordState
sentenceState = (state >> shiftSentenceState) & maskSentenceState
lineState = (state >> shiftLineState) & maskLineState
firstProp = state >> shiftPropState
}
// Transition until we find a grapheme cluster boundary.
width := runeWidth(r, firstProp)
for {
var (
graphemeBoundary, wordBoundary, sentenceBoundary bool
lineBreak, prop int
)
r, l := utf8.DecodeRune(remainder)
remainder = b[length+l:]
graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
wordState, wordBoundary = transitionWordBreakState(wordState, r, remainder, "")
sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, remainder, "")
lineState, lineBreak = transitionLineBreakState(lineState, r, remainder, "")
if graphemeBoundary {
boundary := lineBreak | (width << ShiftWidth)
if wordBoundary {
boundary |= 1 << shiftWord
}
if sentenceBoundary {
boundary |= 1 << shiftSentence
}
return b[:length], b[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
}
if firstProp == prExtendedPictographic {
if r == vs15 {
width = 1
} else if r == vs16 {
width = 2
}
} else if firstProp != prRegionalIndicator && firstProp != prL {
width += runeWidth(r, prop)
}
length += l
if len(b) <= length {
return b, nil, LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
}
}
}
// StepString is like [Step] but its input and outputs are strings.
func StepString(str string, state int) (cluster, rest string, boundaries int, newState int) {
// An empty byte slice returns nothing.
if len(str) == 0 {
return
}
// Extract the first rune.
r, length := utf8.DecodeRuneInString(str)
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
prop := propertyGraphemes(r)
return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (runeWidth(r, prop) << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState)
}
// If we don't know the state, determine it now.
var graphemeState, wordState, sentenceState, lineState, firstProp int
remainder := str[length:]
if state < 0 {
graphemeState, firstProp, _ = transitionGraphemeState(state, r)
wordState, _ = transitionWordBreakState(state, r, nil, remainder)
sentenceState, _ = transitionSentenceBreakState(state, r, nil, remainder)
lineState, _ = transitionLineBreakState(state, r, nil, remainder)
} else {
graphemeState = state & maskGraphemeState
wordState = (state >> shiftWordState) & maskWordState
sentenceState = (state >> shiftSentenceState) & maskSentenceState
lineState = (state >> shiftLineState) & maskLineState
firstProp = state >> shiftPropState
}
// Transition until we find a grapheme cluster boundary.
width := runeWidth(r, firstProp)
for {
var (
graphemeBoundary, wordBoundary, sentenceBoundary bool
lineBreak, prop int
)
r, l := utf8.DecodeRuneInString(remainder)
remainder = str[length+l:]
graphemeState, prop, graphemeBoundary = transitionGraphemeState(graphemeState, r)
wordState, wordBoundary = transitionWordBreakState(wordState, r, nil, remainder)
sentenceState, sentenceBoundary = transitionSentenceBreakState(sentenceState, r, nil, remainder)
lineState, lineBreak = transitionLineBreakState(lineState, r, nil, remainder)
if graphemeBoundary {
boundary := lineBreak | (width << ShiftWidth)
if wordBoundary {
boundary |= 1 << shiftWord
}
if sentenceBoundary {
boundary |= 1 << shiftSentence
}
return str[:length], str[length:], boundary, graphemeState | (wordState << shiftWordState) | (sentenceState << shiftSentenceState) | (lineState << shiftLineState) | (prop << shiftPropState)
}
if firstProp == prExtendedPictographic {
if r == vs15 {
width = 1
} else if r == vs16 {
width = 2
}
} else if firstProp != prRegionalIndicator && firstProp != prL {
width += runeWidth(r, prop)
}
length += l
if len(str) <= length {
return str, "", LineMustBreak | (1 << shiftWord) | (1 << shiftSentence) | (width << ShiftWidth), grAny | (wbAny << shiftWordState) | (sbAny << shiftSentenceState) | (lbAny << shiftLineState) | (prop << shiftPropState)
}
}
}

61
vendor/github.com/rivo/uniseg/width.go generated vendored Normal file
View File

@ -0,0 +1,61 @@
package uniseg
// EastAsianAmbiguousWidth specifies the monospace width for East Asian
// characters classified as Ambiguous. The default is 1 but some rare fonts
// render them with a width of 2.
var EastAsianAmbiguousWidth = 1
// runeWidth returns the monospace width for the given rune. The provided
// grapheme property is a value mapped by the [graphemeCodePoints] table.
//
// Every rune has a width of 1, except for runes with the following properties
// (evaluated in this order):
//
// - Control, CR, LF, Extend, ZWJ: Width of 0
// - \u2e3a, TWO-EM DASH: Width of 3
// - \u2e3b, THREE-EM DASH: Width of 4
// - East-Asian width Fullwidth and Wide: Width of 2 (Ambiguous and Neutral
// have a width of 1)
// - Regional Indicator: Width of 2
// - Extended Pictographic: Width of 2, unless Emoji Presentation is "No".
func runeWidth(r rune, graphemeProperty int) int {
switch graphemeProperty {
case prControl, prCR, prLF, prExtend, prZWJ:
return 0
case prRegionalIndicator:
return 2
case prExtendedPictographic:
if property(emojiPresentation, r) == prEmojiPresentation {
return 2
}
return 1
}
switch r {
case 0x2e3a:
return 3
case 0x2e3b:
return 4
}
switch propertyEastAsianWidth(r) {
case prW, prF:
return 2
case prA:
return EastAsianAmbiguousWidth
}
return 1
}
// StringWidth returns the monospace width for the given string, that is, the
// number of same-size cells to be occupied by the string.
func StringWidth(s string) (width int) {
state := -1
for len(s) > 0 {
var w int
_, s, w, state = FirstGraphemeClusterInString(s, state)
width += w
}
return
}

89
vendor/github.com/rivo/uniseg/word.go generated vendored Normal file
View File

@ -0,0 +1,89 @@
package uniseg
import "unicode/utf8"
// FirstWord returns the first word found in the given byte slice according to
// the rules of [Unicode Standard Annex #29, Word Boundaries]. This function can
// be called continuously to extract all words from a byte slice, as illustrated
// in the example below.
//
// If you don't know the current state, for example when calling the function
// for the first time, you must pass -1. For consecutive calls, pass the state
// and rest slice returned by the previous call.
//
// The "rest" slice is the sub-slice of the original byte slice "b" starting
// after the last byte of the identified word. If the length of the "rest" slice
// is 0, the entire byte slice "b" has been processed. The "word" byte slice is
// the sub-slice of the input slice containing the identified word.
//
// Given an empty byte slice "b", the function returns nil values.
//
// [Unicode Standard Annex #29, Word Boundaries]: http://unicode.org/reports/tr29/#Word_Boundaries
func FirstWord(b []byte, state int) (word, rest []byte, newState int) {
// An empty byte slice returns nothing.
if len(b) == 0 {
return
}
// Extract the first rune.
r, length := utf8.DecodeRune(b)
if len(b) <= length { // If we're already past the end, there is nothing else to parse.
return b, nil, wbAny
}
// If we don't know the state, determine it now.
if state < 0 {
state, _ = transitionWordBreakState(state, r, b[length:], "")
}
// Transition until we find a boundary.
var boundary bool
for {
r, l := utf8.DecodeRune(b[length:])
state, boundary = transitionWordBreakState(state, r, b[length+l:], "")
if boundary {
return b[:length], b[length:], state
}
length += l
if len(b) <= length {
return b, nil, wbAny
}
}
}
// FirstWordInString is like [FirstWord] but its input and outputs are strings.
func FirstWordInString(str string, state int) (word, rest string, newState int) {
// An empty byte slice returns nothing.
if len(str) == 0 {
return
}
// Extract the first rune.
r, length := utf8.DecodeRuneInString(str)
if len(str) <= length { // If we're already past the end, there is nothing else to parse.
return str, "", wbAny
}
// If we don't know the state, determine it now.
if state < 0 {
state, _ = transitionWordBreakState(state, r, nil, str[length:])
}
// Transition until we find a boundary.
var boundary bool
for {
r, l := utf8.DecodeRuneInString(str[length:])
state, boundary = transitionWordBreakState(state, r, nil, str[length+l:])
if boundary {
return str[:length], str[length:], state
}
length += l
if len(str) <= length {
return str, "", wbAny
}
}
}

1883
vendor/github.com/rivo/uniseg/wordproperties.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

282
vendor/github.com/rivo/uniseg/wordrules.go generated vendored Normal file
View File

@ -0,0 +1,282 @@
package uniseg
import "unicode/utf8"
// The states of the word break parser.
const (
wbAny = iota
wbCR
wbLF
wbNewline
wbWSegSpace
wbHebrewLetter
wbALetter
wbWB7
wbWB7c
wbNumeric
wbWB11
wbKatakana
wbExtendNumLet
wbOddRI
wbEvenRI
wbZWJBit = 16 // This bit is set for any states followed by at least one zero-width joiner (see WB4 and WB3c).
)
// wbTransitions implements the word break parser's state transitions. It's
// anologous to [grTransitions], see comments there for details.
//
// Unicode version 15.0.0.
func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) {
switch uint64(state) | uint64(prop)<<32 {
// WB3b.
case wbAny | prNewline<<32:
return wbNewline, true, 32
case wbAny | prCR<<32:
return wbCR, true, 32
case wbAny | prLF<<32:
return wbLF, true, 32
// WB3a.
case wbNewline | prAny<<32:
return wbAny, true, 31
case wbCR | prAny<<32:
return wbAny, true, 31
case wbLF | prAny<<32:
return wbAny, true, 31
// WB3.
case wbCR | prLF<<32:
return wbLF, false, 30
// WB3d.
case wbAny | prWSegSpace<<32:
return wbWSegSpace, true, 9990
case wbWSegSpace | prWSegSpace<<32:
return wbWSegSpace, false, 34
// WB5.
case wbAny | prALetter<<32:
return wbALetter, true, 9990
case wbAny | prHebrewLetter<<32:
return wbHebrewLetter, true, 9990
case wbALetter | prALetter<<32:
return wbALetter, false, 50
case wbALetter | prHebrewLetter<<32:
return wbHebrewLetter, false, 50
case wbHebrewLetter | prALetter<<32:
return wbALetter, false, 50
case wbHebrewLetter | prHebrewLetter<<32:
return wbHebrewLetter, false, 50
// WB7. Transitions to wbWB7 handled by transitionWordBreakState().
case wbWB7 | prALetter<<32:
return wbALetter, false, 70
case wbWB7 | prHebrewLetter<<32:
return wbHebrewLetter, false, 70
// WB7a.
case wbHebrewLetter | prSingleQuote<<32:
return wbAny, false, 71
// WB7c. Transitions to wbWB7c handled by transitionWordBreakState().
case wbWB7c | prHebrewLetter<<32:
return wbHebrewLetter, false, 73
// WB8.
case wbAny | prNumeric<<32:
return wbNumeric, true, 9990
case wbNumeric | prNumeric<<32:
return wbNumeric, false, 80
// WB9.
case wbALetter | prNumeric<<32:
return wbNumeric, false, 90
case wbHebrewLetter | prNumeric<<32:
return wbNumeric, false, 90
// WB10.
case wbNumeric | prALetter<<32:
return wbALetter, false, 100
case wbNumeric | prHebrewLetter<<32:
return wbHebrewLetter, false, 100
// WB11. Transitions to wbWB11 handled by transitionWordBreakState().
case wbWB11 | prNumeric<<32:
return wbNumeric, false, 110
// WB13.
case wbAny | prKatakana<<32:
return wbKatakana, true, 9990
case wbKatakana | prKatakana<<32:
return wbKatakana, false, 130
// WB13a.
case wbAny | prExtendNumLet<<32:
return wbExtendNumLet, true, 9990
case wbALetter | prExtendNumLet<<32:
return wbExtendNumLet, false, 131
case wbHebrewLetter | prExtendNumLet<<32:
return wbExtendNumLet, false, 131
case wbNumeric | prExtendNumLet<<32:
return wbExtendNumLet, false, 131
case wbKatakana | prExtendNumLet<<32:
return wbExtendNumLet, false, 131
case wbExtendNumLet | prExtendNumLet<<32:
return wbExtendNumLet, false, 131
// WB13b.
case wbExtendNumLet | prALetter<<32:
return wbALetter, false, 132
case wbExtendNumLet | prHebrewLetter<<32:
return wbHebrewLetter, false, 132
case wbExtendNumLet | prNumeric<<32:
return wbNumeric, false, 132
case wbExtendNumLet | prKatakana<<32:
return wbKatakana, false, 132
default:
return -1, false, -1
}
}
// transitionWordBreakState determines the new state of the word break parser
// given the current state and the next code point. It also returns whether a
// word boundary was detected. If more than one code point is needed to
// determine the new state, the byte slice or the string starting after rune "r"
// can be used (whichever is not nil or empty) for further lookups.
func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
// Determine the property of the next character.
nextProperty := property(workBreakCodePoints, r)
// "Replacing Ignore Rules".
if nextProperty == prZWJ {
// WB4 (for zero-width joiners).
if state == wbNewline || state == wbCR || state == wbLF {
return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a.
}
if state < 0 {
return wbAny | wbZWJBit, false
}
return state | wbZWJBit, false
} else if nextProperty == prExtend || nextProperty == prFormat {
// WB4 (for Extend and Format).
if state == wbNewline || state == wbCR || state == wbLF {
return wbAny, true // Make sure we don't apply WB4 to WB3a.
}
if state == wbWSegSpace || state == wbAny|wbZWJBit {
return wbAny, false // We don't break but this is also not WB3d or WB3c.
}
if state < 0 {
return wbAny, false
}
return state, false
} else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 {
// WB3c.
return wbAny, false
}
if state >= 0 {
state = state &^ wbZWJBit
}
// Find the applicable transition in the table.
var rule int
newState, wordBreak, rule = wbTransitions(state, nextProperty)
if newState < 0 {
// No specific transition found. Try the less specific ones.
anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny)
anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty)
if anyPropState >= 0 && anyStateState >= 0 {
// Both apply. We'll use a mix (see comments for grTransitions).
newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
if anyPropRule < anyStateRule {
wordBreak, rule = anyPropWordBreak, anyPropRule
}
} else if anyPropState >= 0 {
// We only have a specific state.
newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule
// This branch will probably never be reached because okAnyState will
// always be true given the current transition map. But we keep it here
// for future modifications to the transition map where this may not be
// true anymore.
} else if anyStateState >= 0 {
// We only have a specific property.
newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
} else {
// No known transition. WB999: Any ÷ Any.
newState, wordBreak, rule = wbAny, true, 9990
}
}
// For those rules that need to look up runes further in the string, we
// determine the property after nextProperty, skipping over Format, Extend,
// and ZWJ (according to WB4). It's -1 if not needed, if such a rune cannot
// be determined (because the text ends or the rune is faulty).
farProperty := -1
if rule > 60 &&
(state == wbALetter || state == wbHebrewLetter || state == wbNumeric) &&
(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6.
nextProperty == prDoubleQuote || // WB7b.
nextProperty == prMidNum) { // WB12.
for {
var (
r rune
length int
)
if b != nil { // Byte slice version.
r, length = utf8.DecodeRune(b)
b = b[length:]
} else { // String version.
r, length = utf8.DecodeRuneInString(str)
str = str[length:]
}
if r == utf8.RuneError {
break
}
prop := property(workBreakCodePoints, r)
if prop == prExtend || prop == prFormat || prop == prZWJ {
continue
}
farProperty = prop
break
}
}
// WB6.
if rule > 60 &&
(state == wbALetter || state == wbHebrewLetter) &&
(nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
(farProperty == prALetter || farProperty == prHebrewLetter) {
return wbWB7, false
}
// WB7b.
if rule > 72 &&
state == wbHebrewLetter &&
nextProperty == prDoubleQuote &&
farProperty == prHebrewLetter {
return wbWB7c, false
}
// WB12.
if rule > 120 &&
state == wbNumeric &&
(nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
farProperty == prNumeric {
return wbWB11, false
}
// WB15 and WB16.
if newState == wbAny && nextProperty == prRegionalIndicator {
if state != wbOddRI && state != wbEvenRI { // Includes state == -1.
// Transition into the first RI.
return wbOddRI, true
}
if state == wbOddRI {
// Don't break pairs of Regional Indicators.
return wbEvenRI, false
}
return wbOddRI, true // We can break after a pair.
}
return
}

3
vendor/modules.txt vendored
View File

@ -597,6 +597,9 @@ github.com/quasoft/memstore
# github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec
## explicit; go 1.12
github.com/remyoudompheng/bigfft
# github.com/rivo/uniseg v0.4.7
## explicit; go 1.18
github.com/rivo/uniseg
# github.com/rogpeppe/go-internal v1.13.2-0.20241226121412-a5dc8ff20d0a
## explicit; go 1.22.0
github.com/rogpeppe/go-internal/fmtsort